var data = [{"loc": [4.315114974975586, 4.121492385864258], "id": 1321, "title": "A Neural-Symbolic Approach to Natural Language Understanding", "authors": "Zhixuan Liu, Zihao Wang, Yuan Lin and Hang Li", "abstract": "Deep neural networks, empowered by pre-trained language models, have achieved remarkable results in natural language understanding (NLU) tasks. However, their performances can drastically deteriorate when logical reasoning is needed. This is because NLU in principle depends on not only analogical reasoning, which deep neural networks are good at, but also logical reasoning. According to the dual-process theory, analogical reasoning and logical reasoning are respectively carried out by System 1 and System 2 in the human brain. Inspired by the theory, we present a novel framework for NLU called Neural-Symbolic Processor (NSP), which performs analogical reasoning based on neural processing and logical reasoning based on both neural and symbolic processing. As a case study, we conduct experiments on two NLU tasks, question answering (QA) and natural language inference (NLI), when numerical reasoning (a type of logical reasoning) is necessary. The experimental results show that our method significantly outperforms state-of-the-art methods in both tasks.", "track": "Ethic Concerns:Efficient Methods for NLP", "label": 12}, {"loc": [5.696519374847412, 6.273229598999023], "id": 1810, "title": "Towards Intelligent Clinically-Informed Language Analyses of People with Bipolar Disorder and Schizophrenia", "authors": "Ankit Aich, Avery Quynh, Varsha Badal, Amy Pinkham, Philip Harvey, Colin Depp and Natalie Parde", "abstract": "NLP offers a myriad of opportunities to support mental health research. However, prior work has almost exclusively focused on social media data, for which diagnoses are difficult or impossible to validate. We present a first-of-its-kind dataset of manually transcribed interactions with people clinically diagnosed with bipolar disorder and schizophrenia, as well as healthy controls. Data was collected through validated clinical tasks and paired with diagnostic measures. We extract 100+ temporal, sentiment, psycholinguistic, emotion, and lexical features from the data and establish classification validity using a variety of models to study language differences between diagnostic groups. Our models achieve strong classification performance (maximum F1=0.93-0.96), and lead to the discovery of interesting associations between linguistic features and diagnostic class. It is our hope that this dataset will offer high value to clinical and NLP researchers, with potential for widespread broader impacts.", "track": "Ethic Concerns:Resources and Evaluation", "label": 1}, {"loc": [4.698927402496338, 6.977048397064209], "id": 2293, "title": "Towards Identifying Social Bias in Dialog Systems: Framework, Dataset, and Benchmark", "authors": "Jingyan ZHOU, Jiawen Deng, Fei Mi, Yitong Li, Yasheng Wang, Minlie Huang, Xin Jiang, Qun Liu and Helen Meng", "abstract": "Among all the safety concerns that hinder the deployment of open-domain dialog systems (e.g., offensive languages, biases, and toxic behaviors), social bias presents an insidious challenge. Addressing this challenge requires rigorous analyses and normative reasoning. In this paper, we focus our investigation on social bias measurement to facilitate the development of unbiased dialog systems. We first propose a novel Dial-Bias Framework for analyzing the social bias in conversations using a holistic method beyond bias lexicons or dichotomous annotations. Leveraging the proposed framework, we further introduce the CDial-Bias Dataset which is, to the best of our knowledge, the first annotated Chinese social bias dialog dataset. We also establish a fine-grained dialog bias measurement benchmark and conduct in-depth ablation studies to shed light on the utility of the detailed annotations in the proposed dataset. Finally, we evaluate representative Chinese generative models with our classifiers to unveil the presence of social bias in these systems.", "track": "Ethic Concerns:Ethics", "label": 21}, {"loc": [4.80926513671875, 8.684630393981934], "id": 3479, "title": "Cards Against AI: Predicting Humor in a Fill-in-the-blank Party Game", "authors": "Dan Ofer and Dafna Shahaf", "abstract": "Humor is an inherently social phenomenon, with humorous utterances shaped by what is socially and culturally accepted. Understanding humor is an important NLP challenge, with many applications to human-computer interactions. In this work we explore humor in the context of Cards Against Humanity -- a party game where players complete fill-in-the-blank statements using cards that can be offensive or politically incorrect.\n\nWe introduce a novel dataset of 300,000 online games of Cards Against Humanity, including 785K unique jokes, analyze it and provide insights. We trained machine learning models to predict the winning joke per game, achieving performance twice as good (20%) as random, even without any user information.\nOn the more difficult task of judging novel cards, we see the models' ability to generalize is moderate. \nInterestingly, we find that our models are primarily focused on punchline card, with the context having little impact.\nAnalyzing feature importance, we observe that short, crude, juvenile punchlines tend to win.", "track": "Ethic Concerns:Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [2.7227730751037598, 8.804580688476562], "id": 3506, "title": "ClinicalT5: A Generative Language Model for Clinical Text", "authors": "Qiuhao Lu, Dejing Dou and Thien Huu Nguyen", "abstract": "In the past few years, large pre-trained language models (PLMs) have been widely adopted in different areas and have made fundamental improvements over a variety of downstream tasks in natural language processing (NLP). Meanwhile, domain-specific variants of PLMs are being proposed to address the needs of domains that demonstrate a specific pattern of writing and vocabulary, e.g., BioBERT for the biomedical domain and ClinicalBERT for the clinical domain. Recently, generative language models like BART and T5 are gaining popularity with their competitive performance on text generation as well as on tasks cast as generative problems. However, in the clinical domain, such domain-specific generative variants are still underexplored. To address this need, our work introduces a T5-based text-to-text transformer model pre-trained on clinical text, i.e., ClinicalT5. We evaluate the proposed model both intrinsically and extrinsically over a diverse set of tasks across multiple datasets, and show that ClinicalT5 dramatically outperforms T5 in the domain-specific tasks and compares favorably with its close baselines.", "track": "Ethic Concerns:Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [5.135379314422607, 12.309901237487793], "id": 2082, "title": "How well can Text-to-Image Generative Models understand Ethical Natural Language Interventions?", "authors": "Hritik Bansal, Da Yin, Masoud Monajatipoor and Kai-Wei Chang", "abstract": "Text-to-image generative models have achieved unprecedented success in generating high-quality images based on natural language descriptions. However, it is shown that these models tend to favor specific social groups when prompted with neutral text descriptions (e.g., `a photo of a lawyer'). Following Zhao et al. (2021), we study the effect on the diversity of the generated images when adding \\textit{ethical intervention} that supports equitable judgment (e.g., `if all individuals can be a lawyer irrespective of their gender') in the input prompts. To this end, we introduce an Ethical NaTural Language Interventions in Text-to-Image GENeration (ENTIGEN) benchmark dataset to evaluate the change in image generations conditional on ethical interventions across three social axes -- gender, skin color, and culture. Through CLIP-based and human evaluation on minDALL.E, DALL.E-mini and Stable Diffusion, we find that the model generations cover diverse social groups while preserving the image quality. In some cases, the generations would be anti-stereotypical (e.g., models tend to create images with individuals that are perceived as man when fed with prompts about makeup) in the presence of ethical intervention. Preliminary studies indicate that a large change in the model predictions is triggered by certain phrases such as `irrespective of gender' in the context of gender bias in the ethical interventions. We release code and annotated data at https://github.com/Hritikbansal/entigen\\_emnlp.", "track": "Ethic Concerns:Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.785617828369141, 6.930328369140625], "id": 4156, "title": "D4: a Chinese Dialogue Dataset for Depression-Diagnosis-Oriented Chat", "authors": "Binwei Yao, Chao Shi, Likai Zou, Lingfeng Dai, Mengyue Wu, Lu Chen, Zhen Wang and Kai Yu", "abstract": "In a depression-diagnosis-directed clinical session, doctors initiate a conversation with ample emotional support that guides the patients to expose their symptoms based on clinical diagnosis criteria. Such a dialogue system is distinguished from existing single-purpose human-machine dialog systems, as it combines task-oriented and chit-chats with uniqueness in dialogue topics and procedures. However, due to the social stigma associated with mental illness, the dialogue data related to depression consultation and diagnosis are rarely disclosed. Based on clinical depression diagnostic criteria ICD-11 and DSM-5, we designed a 3-phase procedure to construct D$^4$: a Chinese Dialogue Dataset for Depression-Diagnosis-Oriented Chat, which simulates the dialogue between doctors and patients during the diagnosis of depression, including diagnosis results and symptom summary given by professional psychiatrists for each conversation. Upon the newly-constructed dataset, four tasks mirroring the depression diagnosis process are established: response generation, topic prediction, dialog summary, and severity classification of depressive episode and suicide risk. Multi-scale evaluation results demonstrate that a more empathy-driven and diagnostic-accurate consultation dialogue system trained on our dataset can be achieved compared to rule-based bots.", "track": "Ethic Concerns:Dialogue and Interactive Systems", "label": 4}, {"loc": [8.15729808807373, 2.99759840965271], "id": 2865, "title": "Quantifying Privacy Risks of Masked Language Models Using Membership Inference Attacks", "authors": "Fatemehsadat Mireshghallah, Kartik Goyal, Archit Uniyal, Taylor Berg-Kirkpatrick and Reza Shokri", "abstract": "The wide adoption and application of Masked language models~(MLMs) on sensitive data (from legal to medical) necessitates a thorough quantitative investigation into their privacy vulnerabilities. \nPrior attempts at measuring leakage of MLMs via membership inference attacks have been inconclusive, implying potential robustness of MLMs to privacy attacks.\nIn this work, we posit that prior attempts were inconclusive because they based their attack solely on the MLM's model score. We devise a stronger membership inference attack based on likelihood ratio hypothesis testing that involves an additional reference MLM to more accurately quantify the privacy risks of memorization in MLMs. We show that masked language models are indeed susceptible to likelihood ratio membership inference attacks: Our empirical results, on models trained on medical notes, show that our attack improves the AUC of prior membership inference attacks from $0.66$ to an alarmingly high $0.90$ level.", "track": "Ethic Concerns:Ethics", "label": 21}, {"loc": [4.040225982666016, 4.001230716705322], "id": 114, "title": "LogicSolver: Towards Interpretable Math Word Problem Solving with Logical Prompt-enhanced Learning", "authors": "Zhicheng Yang, Jinghui Qin, Jiaqi Chen, Liang Lin and Xiaodan Liang", "abstract": "Recently, deep learning models have made great progress in MWP solving on answer accuracy. However, they are uninterpretable since they mainly rely on shallow heuristics to achieve high performance without understanding and reasoning the grounded math logic. To address this issue and make a step towards interpretable MWP solving, we first construct a high-quality MWP dataset named InterMWP which consists of 11,495 MWPs and annotates interpretable logical formulas based on algebraic knowledge as the grounded linguistic logic of each solution equation. Different from existing MWP datasets, our InterMWP benchmark asks for a solver to not only output the solution expressions but also predict the corresponding logical formulas. We further propose a novel approach with logical prompt and interpretation generation, called LogicSolver. For each MWP, our LogicSolver first retrieves some highly-correlated algebraic knowledge and then passes them to the backbone model as prompts to improve the semantic representations of MWPs. With these improved semantic representations, our LogicSolver generates corresponding solution expressions and interpretable knowledge formulas in accord with the generated solution expressions, simultaneously. Experimental results show that our LogicSolver has stronger logical formula-based interpretability than baselines while achieving higher answer accuracy with the help of logical prompts, simultaneously. The source code and dataset will be available at https://github.com/yangzhch6/InterMWP.", "track": "NLP Applications", "label": 0}, {"loc": [4.834019184112549, 3.218323230743408], "id": 127, "title": "Commonsense Knowledge Salience Evaluation with a Benchmark Dataset in E-commerce", "authors": "Yincen Qu, Ningyu Zhang, Hui Chen, zelin Dai, Chengming Wang, Xiaoyu Wang, Qiang Chen and Huajun Chen", "abstract": "In e-commerce, the salience of commonsense knowledge (CSK) is beneficial for widespread applications such as product search and recommendation. For example, when users search for ``running'' in e-commerce, they would like to find products highly related to running, such as ``running shoes'' rather than ``shoes''. Nevertheless, many existing CSK collections rank statements solely by confidence scores, and there is no information about which ones are salient from a human perspective. In this work, we define the task of supervised salience evaluation, where given a CSK triple, the model is required to learn whether the triple is salient or not. In addition to formulating the new task, we also release a new Benchmark dataset of Salience Evaluation in E-commerce (BSEE) and hope to promote related research on commonsense knowledge salience evaluation. We conduct experiments in the dataset with several representative baseline models. The experimental results show that salience evaluation is a hard task where models perform poorly on our evaluation set. We further propose a simple but effective approach, PMI-tuning, which shows promise for solving this novel problem. Code is available in https://github.com/OpenBGBenchmark/OpenBG-CSK.", "track": "Resources and Evaluation", "label": 1}, {"loc": [0.9843170642852783, 8.041421890258789], "id": 138, "title": "Automatic Rule Induction for Efficient Semi-Supervised Learning", "authors": "Reid Pryzant, Ziyi Yang, Yichong Xu, Chenguang Zhu and Michael Zeng", "abstract": "Semi-supervised learning has shown promise in allowing NLP models to generalize from small amounts of labeled data. Meanwhile, pretrained transformer models act as black-box correlation engines that are difficult to explain and sometimes behave unreliably. In this paper, we propose tackling both of these challenges via Automatic Rule Induction (ARI), a simple and general-purpose framework for the automatic discovery and integration of symbolic rules into pretrained transformer models. First, we extract weak symbolic rules from low-capacity machine learning models trained on small amounts of labeled data. Next, we use an attention mechanism to integrate these rules into high-capacity pretrained transformer models. Last, the rule-augmented system becomes part of a self-training framework to boost supervision signal on unlabeled data. These steps can be layered beneath a variety of existing weak supervision and semi-supervised NLP algorithms in order to improve performance and interpretability. Experiments across nine sequence classification and relation extraction tasks suggest that ARI can improve state-of-the-art methods with no manual effort and minimal computational overhead.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [7.485325336456299, 6.756872177124023], "id": 143, "title": "Improving Semantic Matching through Dependency-Enhanced Pre-trained Model with Adaptive Fusion", "authors": "Jian Song, Di Liang, Rumei Li, Yuntao Li, Sirui Wang, Minlong Peng, Wei Wu and Yongxin Yu", "abstract": "Transformer-based pre-trained models like BERT have achieved great progress on Semantic Sentence Matching. Meanwhile, dependency prior knowledge has also shown general benefits in multiple NLP tasks. However, how to efficiently integrate dependency prior structure into pre-trained models to better model complex semantic matching relations is still unsettled. In this paper, we propose the Dependency-Enhanced Adaptive Fusion Attention (DAFA), which explicitly introduces dependency structure into pre-trained models and adaptively fuses it with semantic information. Specifically, (i) DAFA first proposes a structure-sensitive paradigm to construct a dependency matrix for calibrating attention weights. (ii) It adopts an adaptive fusion module to integrate the obtained dependency information and the original semantic signals. Moreover, DAFA reconstructs the attention calculation flow and provides better interpretability. By applying it on BERT, our method achieves state-of-the-art or competitive performance on 10 public datasets, demonstrating the benefits of adaptively fusing dependency structure in semantic matching task.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [8.616013526916504, 8.116327285766602], "id": 150, "title": "Sparse Mixers: Combining MoE and Mixing to build a more efficient BERT", "authors": "James Lee-Thorp and Joshua Ainslie", "abstract": "We combine the capacity of sparsely gated Mixture-of-Experts (MoE) with the speed and stability of linear, mixing transformations to design the Sparse Mixer encoder model. Sparse Mixer slightly outperforms BERT on GLUE and SuperGLUE, but more importantly trains 65% faster and runs inference 61% faster. We also present a faster variant, prosaically named Fast Sparse Mixer, that marginally underperforms BERT on SuperGLUE, but trains and runs nearly twice as fast. We justify the design of these two models by carefully ablating through various mixing mechanisms, MoE configurations, and hyperparameters. Sparse Mixer overcomes many of the latency and stability concerns of MoE models and offers the prospect of serving sparse student models, without resorting to distilling them to dense variants.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [4.743602275848389, 3.252457618713379], "id": 159, "title": "KE-GCL: Knowledge Enhanced Graph Contrastive Learning for Commonsense Question Answering", "authors": "Lihui Zhang and Ruifan Li", "abstract": "Commonsense question answering (CQA) aims to choose the correct answers for commonsense questions. Most existing works focus on extracting and reasoning over external knowledge graphs (KG). However, the noise in KG prevents these models from learning effective representations. In this paper, we propose a Knowledge Enhanced Graph Contrastive Learning model (KE-GCL) by incorporating the contextual descriptions of entities and adopting a graph contrastive learning scheme. Specifically, for QA pairs we represent the knowledge from KG and contextual descriptions. Then, the representations of contextual descriptions as context nodes are inserted into KG, forming the knowledge-enhanced graphs.\nMoreover, we design a contrastive learning method on graphs. For knowledge-enhanced graphs, we build their augmented views with an adaptive sampling strategy. After that, we reason over graphs to update their representations by scattering edges and aggregating nodes. To further improve GCL, hard graph negatives are chosen based on incorrect answers. Extensive experiments on two benchmark datasets demonstrate the effectiveness of our proposed KE-GCL, which outperforms previous methods consistently.", "track": "Question Answering", "label": 11}, {"loc": [6.9270710945129395, 6.083178520202637], "id": 161, "title": "Acceptability Judgements via Examining the Topology of Attention Maps", "authors": "Daniil Cherniavskii, Eduard Tulchinskii, Vladislav Mikhailov, Irina Proskurina, Laida Kushnareva, Ekaterina Artemova, Serguei Barannikov, Irina Piontkovskaya, Dmitri Piontkovski and Evgeny Burnaev", "abstract": "The role of the attention mechanism in encoding linguistic knowledge has received special interest in NLP. However, the ability of the attention heads to judge the grammatical acceptability of a sentence has been underexplored. This paper approaches the paradigm of acceptability judgments with topological data analysis (TDA), showing that the geometric properties of the attention graph can be efficiently exploited for two standard practices in linguistics: binary judgments and linguistic minimal pairs. Topological features enhance the BERT-based acceptability classifier scores by $8$\\%-$24$\\% on \\textsc{CoLA} in three languages (English, Italian, and Swedish). By revealing the topological discrepancy between attention maps of minimal pairs, we achieve the human-level performance on the \\textsc{BLiMP} benchmark, outperforming nine statistical and Transformer LM baselines. At the same time, TDA provides the foundation for analyzing the linguistic functions of attention heads and interpreting the correspondence between the graph features and grammatical phenomena. We publicly release the code and other materials used in the experiments.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [7.800222396850586, 9.005216598510742], "id": 170, "title": "Clip-Tuning: Towards Derivative-free Prompt Learning with a Mixture of Rewards", "authors": "Yekun Chai, Shuohuan Wang, Yu Sun, Hao Tian, Hua Wu and Haifeng Wang", "abstract": "Derivative-free prompt learning has emerged as a lightweight alternative to prompt tuning, which only requires model inference to optimize the prompts. However, existing work did not take full advantage of the over-parameterized characteristics of large pre-trained language models (PLMs). In this paper, we propose Clip-Tuning, a simple yet effective method that adopts diverse frozen \"thinned\" networks of PLMs to obtain *a mixture of rewards* and thus advance the derivative-free prompt learning. The thinned networks consist of all the hidden units that survive a stationary dropout strategy, whose inference predictions reflect an ensemble of partial views over prompted training samples. Our method outperforms previous gradient-free prompt learning methods and achieves parity with gradient-based counterparts on seven language understanding benchmarks under few-shot settings.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [6.518729209899902, 1.8739672899246216], "id": 192, "title": "Soft-Labeled Contrastive Pre-Training for Function-Level Code Representation", "authors": "Xiaonan Li, Daya Guo, Yeyun Gong, Yun Lin, Yelong Shen, Xipeng Qiu, Daxin Jiang, Weizhu Chen and Nan Duan", "abstract": "Code contrastive pre-training has recently achieved significant progress on code-related tasks. In this paper, we present \\textbf{SCodeR}, a \\textbf{S}oft-labeled contrastive pre-training framework with two positive sample construction methods to learn functional-level \\textbf{Code} \\textbf{R}epresentation. Considering the relevance between codes in a large-scale code corpus, the soft-labeled contrastive pre-training can obtain fine-grained soft-labels through an iterative adversarial manner and use them to learn better code representation. The positive sample construction is another key for contrastive pre-training. Previous works use transformation-based methods like variable renaming to generate semantically equal positive codes. However, they usually result in the generated code with a highly similar surface form, and thus mislead the model to focus on superficial code structure instead of code semantics. To encourage SCodeR to capture semantic information from the code, we utilize code comments and abstract syntax sub-trees of the code to build positive samples. We conduct experiments on four code-related tasks over seven datasets. Extensive experimental results show that SCodeR achieves new state-of-the-art performance on all of them, which illustrates the effectiveness of the proposed pre-training method.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [5.83168888092041, 11.847955703735352], "id": 195, "title": "Conditioned Masked Language and Image Modeling for Image-Text Dense Retrieval", "authors": "Ziyang Luo, Yadong Xi, Rongsheng Zhang, GongZheng Li, Zeng Zhao and Jing Ma", "abstract": "Image-text retrieval is a fundamental cross-modal task that takes image/text as a query to retrieve relevant data of another type. The large-scale two-stream pre-trained models like CLIP have achieved tremendous success in this area. They embed the images and texts into instance representations with two separate encoders, aligning them on the instance-level with contrastive learning. Beyond this, the following works adopt the fine-grained token-level interaction (Masked Language and Image Modeling) to boost performance further. However, the vanilla token-level objectives are not designed to aggregate the image-text alignment information into the instance representations, but the token representations, causing a gap between pre-training and application. To address this issue, we carefully design two novel conditioned token-level pre-training objectives, Conditioned Masked Language and Image Modeling (ConMLM and ConMIM), forcing models to aggregate the token-level alignment information into the instance representations. Combing with the instance-level contrastive learning, we propose our cross-modal dense retrieval framework, Conditioned Language-Image Pre-training (ConLIP). Experimental results on two popular cross-modal retrieval benchmarks (MSCOCO and Flickr30k) reveal the effectiveness of our methods.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [10.305691719055176, 7.725064754486084], "id": 199, "title": "Does Simultaneous Speech Translation need Simultaneous Models?", "authors": "Sara Papi, Marco Gaido, Matteo Negri and Marco Turchi", "abstract": "In simultaneous speech translation (SimulST), finding the best trade-off between high output quality and low latency is a challenging task. To meet the latency constraints posed by different application scenarios, multiple dedicated SimulST models are usually trained and maintained, generating high computational costs. \nIn this paper, also motivated by the increased sensitivity towards sustainable AI, we investigate whether a single model trained offline can serve both offline and simultaneous applications under different latency regimes without additional training or adaptation. Experiments on en->{de, es} show that, aside from facilitating the adoption of well-established offline architectures and training strategies without affecting latency, offline training achieves similar or better quality compared to the standard SimulST training protocol, also being competitive with the state-of-the-art system.", "track": "Machine Translation", "label": 10}, {"loc": [6.16766357421875, 12.596125602722168], "id": 200, "title": "Utilizing Language-Image Pretraining for Efficient and Robust Bilingual Word Alignment", "authors": "Tuan Q. Dinh, Jy-yong Sohn, Shashank Rajput, Timothy Z. Ossowski, Yifei Ming, Junjie Hu, Dimitris Papailiopoulos and Kangwook Lee", "abstract": "Word translation without parallel corpora has become feasible, rivaling the performance of supervised methods. \nRecent findings have shown the improvement in accuracy and robustness of unsupervised word translation (UWT) by utilizing visual observations, which are universal representations across languages.\nOur work investigates the potential of using not only visual observations but also pretrained language-image models for enabling a more efficient and robust UWT. \nWe develop a novel UWT method dubbed Word Alignment using Language-Image Pretraining (WALIP), leveraging visual observations via the shared image-text embedding space of CLIPs (Radford et al., 2021). \nWALIP has a two-step procedure. \nFirst, we retrieve word pairs with high confidences of similarity, computed using our proposed image-based fingerprints, which define the initial pivot for the alignment.\nSecond, we apply our robust Procrustes algorithm to estimate the linear mapping between two embedding spaces, which iteratively corrects and refines the estimated alignment.\nOur extensive experiments show that WALIP improves upon the state-of-the-art performance of bilingual word alignment for a few language pairs across different word embeddings and displays great robustness to the dissimilarity of language pairs or training corpora for two word embeddings.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [1.7067770957946777, 5.303996562957764], "id": 203, "title": "Grape: Knowledge Graph Enhanced Passage Reader for Open-domain Question Answering", "authors": "Mingxuan Ju, Wenhao Yu, Tong Zhao, Chuxu Zhang and Yanfang Ye", "abstract": "A common thread of open-domain question answering (QA) models employs a retriever-reader pipeline that first retrieves a handful of relevant passages from Wikipedia and then peruses the passages to produce an answer. However, even state-of-the-art readers fail to capture the complex relationships between entities appearing in questions and retrieved passages, leading to answers that contradict the facts. In light of this, we propose a novel knowledge graph enhanced passage reader, namely Grape, to improve the reader performance for open-domain QA. Specifically, for each pair of question and retrieved passage, we first construct a localized bipartite graph, attributed to entity embeddings extracted from the intermediate layer of the reader model. Then, a graph neural network learns relational knowledge while fusing graph and contextual representations into the hidden states of the reader model. Experiments on three open-domain QA benchmarks show Grape can improve the state-of-the-art performance by up to 2.2 exact match score with a negligible overhead increase, with the same retriever and retrieved passages. Our code is publicly available at https://github.com/jumxglhf/GRAPE.", "track": "Question Answering", "label": 11}, {"loc": [4.17355489730835, 9.2208833694458], "id": 216, "title": "NarraSum: A Large-Scale Dataset for Abstractive Narrative Summarization", "authors": "Chao Zhao, Faeze Brahman, Kaiqiang Song, Wenlin Yao, Dian Yu and Snigdha Chaturvedi", "abstract": "Narrative summarization aims to produce a distilled version of a narrative to describe its most salient events and characters. Writing a summary for a narrative is challenging as it requires an understanding of event causality and character behaviors. To encourage research in this direction, we propose NarraSum, a large-scale narrative summarization dataset. It contains 122K narratives, which are collected from the synopses of movies and TV episodes with diverse genres, and their corresponding abstractive summaries. Experiments show that there is a large performance gap between humans and the state-of-the-art summarization models on NarraSum. We hope that this dataset will promote future research in summarization, as well as broader studies of natural language understanding and generation. The dataset is available at https://github.com/zhaochaocs/narrasum.", "track": "Summarization", "label": 14}, {"loc": [10.94374942779541, 6.763333320617676], "id": 238, "title": "NMTScore: A Multilingual Analysis of Translation-based Text Similarity Measures", "authors": "Jannis Vamvas and Rico Sennrich", "abstract": "Being able to rank the similarity of short text segments is an interesting bonus feature of neural machine translation. Translation-based similarity measures include direct and pivot translation probability, as well as translation cross-likelihood, which has not been studied so far. We analyze these measures in the common framework of multilingual NMT, releasing the NMTScore library. Compared to baselines such as sentence embeddings, translation-based measures prove competitive in paraphrase identification and are more robust against adversarial or multilingual input, especially if proper normalization is applied. When used for reference-based evaluation of data-to-text generation in 2 tasks and 17 languages, translation-based measures show a relatively high correlation to human judgments.", "track": "Multilinguality", "label": 13}, {"loc": [6.687414646148682, 6.286711692810059], "id": 244, "title": "Language Models Understand Us, Poorly", "authors": "Jared Moore", "abstract": "Some claim language models understand us. Others won't hear it. To clarify, I investigate three views of human language understanding: as-mapping, as-reliability and as-representation. I argue that while behavioral reliability is necessary for understanding, internal representations are sufficient; they climb the right hill. I review state-of-the-art language and multi-modal models: they are pragmatically challenged by under-specification of form. I question the Scaling Paradigm: limits on resources may prohibit scaled-up models from approaching understanding. Last, I describe how as-representation advances a science of understanding. We need work which probes model internals, adds more of human language, and measures what models can learn.", "track": "Theme Track", "label": 18}, {"loc": [3.9041643142700195, 7.2275800704956055], "id": 246, "title": "Dialogue Meaning Representation for Task-Oriented Dialogue Systems", "authors": "Xiangkun Hu, Junqi Dai, Hang Yan, Yi Zhang, Qipeng Guo, Xipeng Qiu and Zheng Zhang", "abstract": "Dialogue meaning representation formulates natural language utterance semantics in their conversational context in an explicit and machine-readable form. Previous work typically follows the intent-slot framework, which is easy for annotation yet limited in scalability for complex linguistic expressions. A line of works alleviates the representation issue by introducing hierarchical structures but challenging to express complex compositional semantics, such as negation and coreference. We propose Dialogue Meaning Representation (DMR), a pliable and easily extendable representation for task-oriented dialogue. Our representation contains a set of nodes and edges to represent rich compositional semantics. Moreover, we propose an inheritance hierarchy mechanism focusing on domain extensibility. Additionally, we annotated DMR-FastFood, a multi-turn dialogue dataset with more than 70k utterances, with DMR. We propose two evaluation tasks to evaluate different dialogue models and a novel coreference resolution model GNNCoref for the graph-based coreference resolution task. Experiments show that DMR can be parsed well with pre-trained Seq2Seq models, and GNNCoref outperforms the baseline models by a large margin.The dataset and code are available at https://github.com/amazon-research/dialogue-meaning-representation", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [10.83725643157959, 9.356719970703125], "id": 255, "title": "Learning from the Dictionary: Heterogeneous Knowledge Guided Fine-tuning for Chinese Spell Checking", "authors": "Yinghui Li, Shirong Ma, Qingyu Zhou, Zhongli Li, Li Yangning, Shulin Huang, Ruiyang Liu, Chao Li, Yunbo Cao and Haitao Zheng", "abstract": "Chinese Spell Checking (CSC) aims to detect and correct Chinese spelling errors. Recent researches start from the pretrained knowledge of language models and take multimodal information into CSC models to improve the performance. However, they overlook the rich knowledge in the dictionary, the reference book where one can learn how one character should be pronounced, written, and used. In this paper, we propose the LEAD framework, which renders the CSC model to learn heterogeneous knowledge from the dictionary in terms of phonetics, vision, and meaning. LEAD first constructs positive and negative samples according to the knowledge of character phonetics, glyphs, and definitions in the dictionary. Then a unified contrastive learning-based training scheme is employed to refine the representations of the CSC models. Extensive experiments and detailed analyses on the SIGHAN benchmark datasets demonstrate the effectiveness of our proposed methods.", "track": "NLP Applications", "label": 0}, {"loc": [1.8112585544586182, 3.924349308013916], "id": 277, "title": "Salient Phrase Aware Dense Retrieval: Can a Dense Retriever Imitate a Sparse One?", "authors": "Xilun Chen, Kushal Lakhotia, Barlas Oguz, Anchit Gupta, Patrick Lewis, Stan Peshterliev, Yashar Mehdad, Sonal Gupta and Wen-tau Yih", "abstract": "Despite their recent popularity and well-known advantages, dense retrievers still lag behind sparse methods such as BM25 in their ability to reliably match salient phrases and rare entities in the query and to generalize to out-of-domain data. It has been argued that this is an inherent limitation of dense models. We rebut this claim by introducing the Salient Phrase Aware Retriever (SPAR), a dense retriever with the lexical matching capacity of a sparse model. We show that a dense Lexical Model \u039b can be trained to imitate a sparse one, and SPAR is built by augmenting a standard dense retriever with \u039b. Empirically, SPAR shows superior performance on a range of tasks including five question answering datasets, MS MARCO passage retrieval, as well as the EntityQuestions and BEIR benchmarks for out-of-domain evaluation, exceeding the performance of state-of-the-art dense and sparse retrievers. The code and models of SPAR are available at: https://github.com/facebookresearch/dpr-scale/tree/main/spar", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [6.044088363647461, 12.28160572052002], "id": 278, "title": "SMARTAVE: Structured Multimodal Transformer for Product Attribute Value Extraction", "authors": "Qifan Wang, Li Yang, Jingang Wang, Jitin Krishnan, Bo Dai, Sinong Wang, Zenglin Xu, Madian Khabsa and Hao Ma", "abstract": "Automatic product attribute value extraction refers to the task of identifying values of an attribute from the product information. Product attributes are essential in improving online shopping experience for customers. Most existing methods focus on extracting attribute values from product title and description.\nHowever, in many real-world applications, a product is usually represented by multiple modalities beyond title and description, such as product specifications, text and visual information from the product image, etc. In this paper, we propose \\textsc{SMARTAVE}, a \\underline{S}tructure \\underline{M}ltimodal tr\\underline{A}nsforme\\underline{R} for produc\\underline{T} \\underline{A}ttribute \\underline{V}alue \\underline{E}xtraction, which jointly encodes the structured product information from multiple modalities. Specifically, in \\textsc{SMARTAVE} encoder, we introduce hyper-tokens to represent the modality-level information, and local-tokens to represent the original text and visual inputs. Structured attention patterns are designed among the hyper-tokens and local-tokens for learning effective product representation. The attribute values are then extracted based on the learned embeddings. We conduct extensive experiments on two multimodal product datasets. Experimental results demonstrate the superior performance of the proposed approach over several state-of-the-art methods. Ablation studies validate the effectiveness of the structured attentions in modeling the multimodal product information.", "track": "NLP Applications", "label": 0}, {"loc": [6.5096116065979, 1.873337984085083], "id": 283, "title": "When Language Model Meets Private Library", "authors": "Zan Daoguang, Bei Chen, Zeqi Lin, Bei Guan, Wang Yongji and Jian-Guang LOU", "abstract": "With the rapid development of pre-training techniques, a number of language models have been pre-trained on large-scale code corpora and perform well in code generation. In this paper, we investigate how to equip pre-trained language models with the ability of code generation for private libraries. In practice, it is common for programmers to write code using private libraries. However, this is a challenge for language models since they have never seen private APIs during training. Motivated by the fact that private libraries usually come with elaborate API documentation, we propose a novel framework with two modules: the APIRetriever finds useful APIs, and then the APICoder generates code using these APIs. For APIRetriever, we present a dense retrieval system and also design a friendly interaction to involve uses. For APICoder, we can directly use off-the-shelf language models, or continually pre-train the base model on a code corpus containing API information. Both modules are trained with data from public libraries and can be generalized to private ones. Furthermore, we craft three benchmarks for private libraries, named TorchDataEval, MonkeyEval, and BeatNumEval. Experimental results demonstrate the impressive performance of our framework.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [1.0787960290908813, 10.538798332214355], "id": 285, "title": "Cross-Domain Sentiment Classification using Semantic Representation", "authors": "Shichen Li, Zhongqing Wang, Xiaotong Jiang and Guodong Zhou", "abstract": "Previous studies on cross-domain sentiment classification depend on the pivot features or utilize the target data for representation learning, which ignore the semantic relevance between different domains. To this end, we exploit Abstract Meaning Representation (AMR) to help with cross-domain sentiment classification. Compared with the textual input, AMR reduces data sparsity and explicitly provides core semantic knowledge and correlations between different domains. In particular, we develop an algorithm to construct a sentiment-driven semantic graph from sentence-level AMRs. We further design two strategies to linearize the semantic graph and propose a text-graph interaction model to fuse the text and semantic graph representations for cross-domain sentiment classification. Empirical studies show the effectiveness of our proposed model over several strong baselines. The results also indicate the importance of the proposed sentiment-driven semantic graph for cross-domain sentiment classification.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [8.229382514953613, 6.760915756225586], "id": 292, "title": "Yes-Yes-Yes: Proactive Data Collection for ACL Rolling Review and Beyond", "authors": "Nils Dycke, Ilia Kuznetsov and Iryna Gurevych", "abstract": "The shift towards publicly available text sources has enabled language processing at unprecedented scale, yet leaves under-serviced the domains where public and openly licensed data is scarce. Proactively collecting text data for research is a viable strategy to address this scarcity, but lacks systematic methodology taking into account the many ethical, legal and confidentiality-related aspects of data collection. Our work presents a case study on proactive data collection in peer review -- a challenging and under-resourced NLP domain. We outline ethical and legal desiderata for proactive data collection and introduce \"Yes-Yes-Yes\", the first donation-based peer reviewing data collection workflow that meets these requirements. We report on the implementation of Yes-Yes-Yes at ACL Rolling Review and empirically study the implications of proactive data collection for the dataset size and the biases induced by the donation behavior on the peer reviewing platform.", "track": "Ethic Concerns:Ethics", "label": 21}, {"loc": [5.748798370361328, 11.740848541259766], "id": 295, "title": "AssistSR: Task-oriented Video Segment Retrieval for Personal AI Assistant", "authors": "Weixian Lei, DIFEI GAO, Yuxuan Wang, Dongxing Mao, Zihan Liang, Lingmin Ran and Mike Zheng Shou", "abstract": "It is still a pipe dream that personal AI assistants on the phone and AR glasses can assist our daily life in addressing our questions like ``how to adjust the date for this watch?'' and ``how to set its heating duration? (while pointing at an oven)''. The queries used in conventional tasks (i.e. Video Question Answering, Video Retrieval, Moment Localization) are often factoid and based on pure text. In contrast, we present a new task called Task-oriented Question-driven Video Segment Retrieval (TQVSR). Each of our questions is an image-box-text query that focuses on affordance of items in our daily life and expects relevant answer segments to be retrieved from a corpus of instructional video-transcript segments. To support the study of this TQVSR task, we construct a new dataset called AssistSR. We design novel guidelines to create high-quality samples. This dataset contains 3.2k multimodal questions on 1.6k video segments from instructional videos on diverse daily-used items. To address TQVSR, we develop a simple yet effective model called Dual Multimodal Encoders (DME) that significantly outperforms several baseline methods while still having large room for improvement in the future. Moreover, we present detailed ablation analyses. Code and data are available at https://github.com/StanLei52/TQVSR.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [8.017842292785645, 3.1563735008239746], "id": 301, "title": "Dim-Krum: Backdoor-Resistant Federated Learning for NLP with Dimension-wise Krum-Based Aggregation", "authors": "Zhiyuan Zhang, Qi Su and Xu Sun", "abstract": "Despite the potential of federated learning, it is known to be vulnerable to backdoor attacks. Many robust federated aggregation methods are proposed to reduce the potential backdoor risk. However, they are mainly validated in the CV field. In this paper, we find that NLP backdoors are hard to defend against than CV, and we provide a theoretical analysis that the malicious update detection error probabilities are determined by the relative backdoor strengths. NLP attacks tend to have small relative backdoor strengths, which may result in the failure of robust federated aggregation methods for NLP attacks. Inspired by the theoretical results, we can choose some dimensions with higher backdoor strengths to settle this issue. We propose a novel federated aggregation algorithm, Dim-Krum, for NLP tasks, and experimental results validate its effectiveness.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [7.951749801635742, 3.2707250118255615], "id": 302, "title": "Fine-mixing: Mitigating Backdoors in Fine-tuned Language Models", "authors": "Zhiyuan Zhang, Lingjuan Lyu, Xingjun Ma, Chenguang Wang and Xu Sun", "abstract": "Deep Neural Networks (DNNs) are known to be vulnerable to backdoor attacks. In Natural Language Processing (NLP), DNNs are often backdoored during the fine-tuning process of a large-scale Pre-trained Language Model (PLM) with poisoned samples. Although the clean weights of PLMs are readily available, existing methods have ignored this information in defending NLP models against backdoor attacks. In this work, we take the first step to exploit the pre-trained (unfine-tuned) weights to mitigate backdoors in fine-tuned language models. Specifically, we leverage the clean pre-trained weights via two complementary techniques: (1) a two-step Fine-mixing technique, which first mixes the backdoored weights (fine-tuned on poisoned data) with the pre-trained weights, then fine-tunes the mixed weights on a small subset of clean data; (2) an Embedding Purification (E-PUR) technique, which mitigates potential backdoors existing in the word embeddings. We compare Fine-mixing with typical backdoor mitigation methods on three single-sentence sentiment classification tasks and two sentence-pair classification tasks and show that it outperforms the baselines by a considerable margin in all scenarios. We also show that our E-PUR method can benefit existing mitigation methods. Our work establishes a simple but strong baseline defense for secure fine-tuned NLP models against backdoor attacks.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.0858659744262695, 7.013349533081055], "id": 304, "title": "Language Models that Seek for Knowledge: Modular Search & Generation for Dialogue and Prompt Completion", "authors": "Kurt Shuster, Mojtaba Komeili, Leonard Adolphs, Stephen Roller, Arthur Szlam and Jason Weston", "abstract": "Language models (LMs) have recently been shown to generate more factual responses by employing modularity (Zhou et al., 2022) in combination with retrieval (Adolphs et al., 2021). We extend the recent approach of Adolphs et al. (2021) to include internet search as a module. Our SeeKeR (Search engine->Knowledge->Response) method thus applies a single LM to three modular tasks in succession: search, generating knowledge, and generating a final response. We show that, when using SeeKeR as a dialogue model, it outperforms the state-of-the-art model BlenderBot 2 (Chen et al., 2021) on open-domain knowledge-grounded conversations for the same number of parameters, in terms of consistency, knowledge and per-turn engagingness. SeeKeR applied to topical prompt completions as a standard language model outperforms GPT2 (Radford et al., 2019) and GPT3 (Brown et al., 2020) in terms of factuality and topicality, despite GPT3 being a vastly larger model. Our code and models are made publicly available.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [4.962996959686279, 4.583862781524658], "id": 309, "title": "Stretching Sentence-pair NLI Models to Reason over Long Documents and Clusters", "authors": "Tal Schuster, Sihao Chen, Senaka Buthpitiya, Alex Fabrikant and Donald Metzler", "abstract": "Natural Language Inference (NLI) has been extensively studied by the NLP community as a framework for estimating the semantic relation between sentence pairs. While early work identified certain biases in NLI models, recent advancements in modeling and datasets demonstrated promising performance.\nIn this work, we further explore the direct zero-shot applicability of NLI models to real applications, beyond the sentence-pair setting they were trained on. First, we analyze the robustness of these models to longer and out-of-domain inputs. Then, we develop new aggregation methods to allow operating over full documents, reaching state-of-the-art performance on the ContractNLI dataset. Interestingly, we find NLI scores to provide strong retrieval signals, leading to more relevant evidence extractions compared to common similarity-based methods. Finally, we go further and investigate whole document clusters to identify both discrepancies and consensus among sources. In a test case, we find real inconsistencies between Wikipedia pages in different languages about the same topic.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [0.8039590120315552, 8.054597854614258], "id": 313, "title": "Towards Realistic Low-resource Relation Extraction: A Benchmark with Empirical Baseline Study", "authors": "Xin Xu, Xiang Chen, Ningyu Zhang, Xin Xie, Xi Chen and Huajun Chen", "abstract": "This paper presents an empirical study to build relation extraction systems in low-resource settings. Based upon recent pre-trained language models, we comprehensively investigate three schemes to evaluate the performance in low-resource settings: (i) different types of prompt-based methods with few-shot labeled data; (ii) diverse balancing methods to address the long-tailed distribution issue; (iii) data augmentation technologies and self-training to generate more labeled in-domain data. We create a benchmark with 8 relation extraction (RE) datasets covering different languages, domains and contexts and perform extensive comparisons over the proposed schemes with combinations. Our experiments illustrate: (i) Though prompt-based tuning is beneficial in low-resource RE, there is still much potential for improvement, especially in extracting relations from cross-sentence contexts with multiple relational triples; (ii) Balancing methods are not always helpful for RE with long-tailed distribution; (iii) Data augmentation complements existing baselines and can bring much performance gain, while self-training may not consistently achieve advancement to low-resource RE. Code and datasets are in https://github.com/zjunlp/LREBench.", "track": "Information Extraction", "label": 5}, {"loc": [10.216456413269043, 6.932660102844238], "id": 319, "title": "CLLE: A Benchmark for Continual Language Learning Evaluation in Multilingual Machine Translation", "authors": "Han Zhang, Sheng Zhang, Yang Xiang, Bin Liang, Jinsong Su, Zhongjian Miao, Hui Wang and Ruifeng Xu", "abstract": "Continual Language Learning (CLL) in multilingual translation is inevitable when new languages are required to be translated. Due to the lack of unified and generalized benchmarks, the evaluation of existing methods is greatly influenced by experimental design which usually has a big gap from the industrial demands. In this work, we propose the first Continual Language Learning Evaluation benchmark CLLE in multilingual translation. CLLE consists of a Chinese-centric corpus --- CN-25 and two CLL tasks --- the close-distance language continual learning task and the language family continual learning task designed for real and disparate demands. Different from existing translation benchmarks, CLLE considers several restrictions for CLL, including domain distribution alignment, content overlap, language diversity, and the balance of corpus. Furthermore, we propose a novel framework COMETA based on Constrained Optimization and META-learning to alleviate catastrophic forgetting and dependency on history training data by using a meta-model to retain the important parameters for old languages. Our experiments prove that CLLE is a challenging CLL benchmark and that our proposed method is effective when compared with other strong baselines. Due to the construction of the corpus, the task designing and the evaluation method are independent of the centric language, we also construct and release the English-centric corpus EN-25 to facilitate academic research.", "track": "Machine Translation", "label": 10}, {"loc": [2.0562775135040283, 4.04571008682251], "id": 323, "title": "Lexicon-Enhanced Self-Supervised Training for Multilingual Dense Retrieval", "authors": "Houxing Ren, Linjun Shou, Jian Pei, Ning Wu, Ming Gong and Daxin Jiang", "abstract": "Recent multilingual pre-trained models have shown better performance in various multilingual tasks. However, these models perform poorly on multilingual retrieval tasks due to lacking multilingual training data. In this paper, we propose to mine and generate self-supervised training data based on a large-scale unlabeled corpus. We carefully design a mining method which combines the sparse and dense models to mine the relevance of unlabeled queries and passages. And we introduce a query generator to generate more queries in target languages for unlabeled passages. Through extensive experiments on Mr. TYDI dataset and an industrial dataset from a commercial search engine, we demonstrate that our method performs better than baselines based on various pre-trained multilingual models. Our method even achieves on-par performance with the supervised method on the latter dataset.", "track": "Multilinguality", "label": 13}, {"loc": [5.480278968811035, 5.108109951019287], "id": 333, "title": "Improve Interpretability of Neural Networks via Sparse Contrastive Coding", "authors": "Junhong Liu, Yijie Lin, Liang Jiang, Jia Liu, Zujie Wen and Xi Peng", "abstract": "Although explainable artificial intelligence (XAI) has achieved remarkable developments in recent years, there are few efforts have been devoted to the following problems, namely, i) how to develop an explainable method that could explain the black-box in a model-agnostic way? and ii) how to improve the performance and interpretability of the black-box using such explanations instead of pre-collected important attributions? To explore the potential solution, we propose a model-agnostic explanation method termed as Sparse Contrastive Coding (SCC) and verify its effectiveness in text classification and natural language inference. In brief, SCC explains the feature attributions which characterize the importance of words based on the hidden states of each layer of the model. With such word-level explainability, SCC adaptively divides the input sentences into foregrounds and backgrounds in terms of task relevance. Through maximizing the similarity between the foregrounds and input sentences while minimizing the similarity between the backgrounds and input sentences, SSC employs a supervised contrastive learning loss to boost the interpretability and performance of the model. Extensive experiments show the superiority of our method over five state-of-the-art methods in terms of interpretability and classification measurements. The code is available at https://pengxi.me.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.712814807891846, 10.663534164428711], "id": 334, "title": "LEMON: Language-Based Environment Manipulation via Execution-Guided Pre-training", "authors": "Qi Shi, Qian Liu, Bei Chen, Yu Zhang, Ting Liu and Jian-Guang LOU", "abstract": "Language-based environment manipulation requires agents to manipulate the environment following natural language instructions, which is challenging due to the huge space of the environments.\nTo address this challenge, various approaches have been proposed in recent work. Although these approaches work well for their intended environments, they are difficult to generalize across environments. In this work, we propose LEMON, a general framework for language-based environment manipulation tasks. Specifically, we first specify a general approach for language-based environment manipulation tasks, which can deal with various environments using the same generative language model. Then we propose an execution-guided pre-training strategy to inject prior knowledge of environments to the language model with a pure synthetic pre-training corpus. Experimental results on tasks including Alchemy, Scene, Tangrams, ProPara and Recipes demonstrate the effectiveness of LEMON: it achieves new state-of-the-art results on four of the tasks, and the execution-guided pre-training strategy brings remarkable improvements on all experimental tasks.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [1.738396406173706, 9.108654022216797], "id": 335, "title": "CROP: Zero-shot Cross-lingual Named Entity Recognition with Multilingual Labeled Sequence Translation", "authors": "Jian Yang, Shaohan Huang, Shuming Ma, Yuwei Yin, Li Dong, Dongdong Zhang, hongcheng guo, Zhoujun Li and Furu Wei", "abstract": "Named entity recognition (NER) suffers from the scarcity of annotated training data, especially for low-resource languages without labeled data. Cross-lingual NER has been proposed to alleviate this issue by transferring knowledge from high-resource languages to low-resource languages via aligned cross-lingual representations or machine translation results. However, the performance of cross-lingual NER methods is severely affected by the unsatisfactory quality of translation or label projection. To address these problems, we propose a Cross-lingual Entity Projection framework (CROP) to enable zero-shot cross-lingual NER with the help of a multilingual labeled sequence translation model. Specifically, the target sequence is first translated into the source language and then tagged by a source NER model. We further adopt a labeled sequence translation model to project the tagged sequence back to the target language and label the target raw sentence. Ultimately, the whole pipeline is integrated into an end-to-end model by the way of self-training. Experimental results on two benchmarks demonstrate that our method substantially outperforms the previous strong baseline by a large margin of +3~7 F1 scores and achieves state-of-the-art performance.", "track": "Multilinguality", "label": 13}, {"loc": [7.609042167663574, 12.30031681060791], "id": 339, "title": "Handling and Presenting Harmful Text in NLP Research", "authors": "Hannah Rose Kirk, Abeba Birhane, Bertie Vidgen and Leon Derczynski", "abstract": "Text data can pose a risk of harm. However, the risks are not fully understood, and how to handle, present, and discuss harmful text in a safe way remains an unresolved issue in the NLP community. We provide an analytical framework categorising harms on three axes: (1) the harm type (e.g., misinformation, hate speech or racial stereotypes); (2) whether a harm is sought as a feature of the research design if explicitly studying harmful content (e.g., training a hate speech classifier), versus unsought if harmful content is encountered when working on unrelated problems (e.g., language generation or part-of-speech tagging); and (3) who it affects, from people (mis)represented in the data to those handling the data and those publishing on the data. We provide advice for practitioners, with concrete steps for mitigating harm in research and in publication. To assist implementation we introduce HarmCheck -- a documentation standard for handling and presenting harmful text in research.", "track": "Theme Track", "label": 18}, {"loc": [6.437847137451172, 12.105864524841309], "id": 356, "title": "Multimodal Contrastive Learning via Uni-Modal Coding and Cross-Modal Prediction for Multimodal Sentiment Analysis", "authors": "Ronghao Lin and Haifeng Hu", "abstract": "Multimodal representation learning is a challenging task in which previous work mostly focus on either uni-modality pre-training or cross-modality fusion. In fact, we regard modeling multimodal representation as building a skyscraper, where laying stable foundation and designing the main structure are equally essential. The former is like encoding robust uni-modal representation while the later is like integrating interactive information among different modalities, both of which are critical to learning an effective multimodal representation. Recently, contrastive learning has been successfully applied in representation learning, which can be utilized as the pillar of the skyscraper and benefit the model to extract the most important features contained in the multimodal data. In this paper, we propose a novel framework named MultiModal Contrastive Learning (MMCL) for multimodal representation to capture intra- and inter-modality dynamics simultaneously. Specifically, we devise uni-modal contrastive coding with an efficient uni-modal feature augmentation strategy to filter inherent noise contained in acoustic and visual modality and acquire more robust uni-modality representations. Besides, a pseudo siamese network is presented to predict representation across different modalities, which successfully captures cross-modal dynamics. Moreover, we design two contrastive learning tasks, instance- and sentiment-based contrastive learning, to promote the process of prediction and learn more interactive information related to sentiment. Extensive experiments conducted on two public datasets demonstrate that our method surpasses the state-of-the-art methods.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [8.004697799682617, 9.306853294372559], "id": 359, "title": "Towards Unified Prompt Tuning for Few-shot Text Classification", "authors": "Jianing Wang, Chengyu Wang, Fuli Luo, Chuanqi Tan, Minghui Qiu, Fei Yang, Qiuhui Shi, Songfang Huang and Ming Gao", "abstract": "Prompt-based fine-tuning has boosted the performance of Pre-trained Language Models (PLMs) on few-shot text classification by employing task-specific prompts. Yet, PLMs are unfamiliar with prompt-style expressions during pre-training, which limits the few-shot learning performance on downstream tasks.\nIt would be desirable if the models can acquire some prompting knowledge before adapting to specific NLP tasks. We present the Unified Prompt Tuning (UPT) framework, leading to better few-shot text classification for BERT-style models by explicitly capturing prompting semantics from non-target NLP datasets. In UPT, a novel paradigm Prompt-Options-Verbalizer is proposed for joint prompt learning across different NLP tasks, forcing PLMs to capture task-invariant prompting knowledge. We further design a self-supervised task named Knowledge-enhanced Selective Masked Language Modeling to improve the PLM's generalization abilities for accurate adaptation to previously unseen tasks. After multi-task learning across multiple tasks, the PLM can be better prompt-tuned towards any dissimilar target tasks in low-resourced settings. Experiments over a variety of NLP tasks show that UPT consistently outperforms state-of-the-arts for prompt-based fine-tuning.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [5.389382362365723, 5.044684410095215], "id": 367, "title": "Can language models learn from explanations in context?", "authors": "Andrew Kyle Lampinen, Ishita Dasgupta, Stephanie C.Y. Chan, Kory Mathewson, MH Tessler, Antonia Creswell, James L. McClelland, Jane X. Wang and Felix Hill", "abstract": "Language Models (LMs) can perform new tasks by adapting to a few in-context examples. For humans, explanations that connect examples to task principles can improve learning. We therefore investigate whether explanations of few-shot examples can help LMs. We annotate questions from 40 challenging tasks with answer explanations, and various matched control explanations. We evaluate how different types of explanations, instructions, and controls affect zero- and few-shot performance. We analyze these results using statistical multilevel modeling techniques that account for the nested dependencies among conditions, tasks, prompts, and models. We find that explanations can improve performance\u2014even without tuning. Furthermore, explanations hand-tuned for performance on a small validation set offer substantially larger benefits, and building a prompt by selecting examples and explanations together substantially improves performance over selecting examples alone. Finally, even untuned explanations outperform carefully matched controls, suggesting that the benefits are due to the link between an example and its explanation, rather than lower-level features. However, only large models benefit. In summary, explanations can support the in-context learning of large LMs on challenging tasks.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [1.8272124528884888, 4.037896156311035], "id": 368, "title": "GNN-encoder: Learning a Dual-encoder Architecture via Graph Neural Networks for Dense Passage Retrieval", "authors": "Jiduan Liu, Jiahao Liu, Yang Yang, Jingang Wang, Wei Wu, Dongyan Zhao and Rui Yan", "abstract": "Recently, retrieval models based on dense representations are dominant in passage retrieval tasks, due to their outstanding ability in terms of capturing semantics of input text compared to the traditional sparse vector space models. A common practice of dense retrieval models is to exploit a dual-encoder architecture to represent a query and a passage independently. Though efficient, such a structure loses interaction between the query-passage pair, resulting in inferior accuracy. To enhance the performance of dense retrieval models without loss of efficiency, we propose a GNN-encoder model in which query (passage) information is fused into passage (query) representations via graph neural networks that are constructed by queries and their top retrieved passages. By this means, we maintain a dual-encoder structure, and retain some interaction information between query-passage pairs in their representations, which enables us to achieve both efficiency and efficacy in passage retrieval. Evaluation results indicate that our method significantly outperforms the existing models on MSMARCO, Natural Questions and TriviaQA datasets, and achieves the new state-of-the-art on these datasets.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [10.829519271850586, 9.34799575805664], "id": 384, "title": "Linguistic Rules-Based Corpus Generation for Native Chinese Grammatical Error Correction", "authors": "Shirong Ma, Yinghui Li, Rongyi Sun, Qingyu Zhou, Shulin Huang, Ding Zhang, Li Yangning, Ruiyang Liu, Zhongli Li, Yunbo Cao, Haitao Zheng and Ying Shen", "abstract": "Chinese Grammatical Error Correction (CGEC) is both a challenging NLP task and a common application in human daily life. Recently, many data-driven approaches are proposed for the development of CGEC research. However, there are two major limitations in the CGEC field: First, the lack of high-quality annotated training corpora prevents the performance of existing CGEC models from being significantly improved. Second, the grammatical errors in widely used test sets are not made by native Chinese speakers, resulting in a significant gap between the CGEC models and the real application. In this paper, we propose a linguistic rules-based approach to construct large-scale CGEC training corpora with automatically generated grammatical errors. Additionally, we present a challenging CGEC benchmark derived entirely from errors made by native Chinese speakers in real-world scenarios. Extensive experiments and detailed analyses not only demonstrate that the training data constructed by our method effectively improves the performance of CGEC models, but also reflect that our benchmark is an excellent resource for further development of the CGEC field.", "track": "Resources and Evaluation", "label": 1}, {"loc": [5.562291145324707, 11.69875717163086], "id": 407, "title": "Rethinking the Video Sampling and Reasoning Strategies for Temporal Sentence Grounding", "authors": "Jiahao Zhu, Daizong Liu, Pan Zhou, Xing Di, Yu Cheng, Song Yang, Wenzheng Xu, Zichuan Xu, Yao Wan, Lichao Sun and Zeyu Xiong", "abstract": "Temporal sentence grounding (TSG) aims to identify the temporal boundary of a specific segment from an untrimmed video by a sentence query. All existing works first utilize a sparse sampling strategy to extract a fixed number of video frames and then interact them with query for reasoning.\nHowever, we argue that these methods have overlooked two indispensable issues:\n1) Boundary-bias: The annotated target segment generally refers to two specific frames as corresponding start and end timestamps. The video downsampling process may lose these two frames and take the adjacent irrelevant frames as new boundaries.\n2) Reasoning-bias: Such incorrect new boundary frames also lead to the reasoning bias during frame-query interaction, reducing the generalization ability of model.\n\nTo alleviate above limitations, in this paper, we propose a novel Siamese Sampling and Reasoning Network (SSRN) for TSG, which introduces a siamese sampling mechanism to generate additional contextual frames to enrich and refine the new boundaries. Specifically, a reasoning strategy is developed to learn the inter-relationship among these frames and generate soft labels on boundaries for more accurate frame-query reasoning. Such mechanism is also able to supplement the absent consecutive visual semantics to the sampled sparse frames for fine-grained activity understanding.\nExtensive experiments demonstrate the effectiveness of SSRN on three challenging datasets.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.5130696296691895, 4.3023576736450195], "id": 412, "title": "System 1 + System 2 = Better World: Neural-Symbolic Chain of Logic Reasoning", "authors": "Wenyue Hua and Yongfeng Zhang", "abstract": "Logical reasoning is a challenge for many current NLP neural network models since it requires more than the ability of learning informative representations from data. Inspired by the Dual Process Theory in cognitive science \u2014 which proposes that human cognition process involves two stages: an intuitive, unconscious and fast process relying on perception calledSystem 1, and a logical, conscious and slow process performing complex reasoning called System 2 \u2014 we leverage neural logic reasoning (System 2) on top of the representation learning models (System 1), which conducts explicit neural-based differentiable logical reasoning on top of the representations learned by the base neural models. Based on experiments on the commonsense knowledge graph completion task, we show that the two-system architecture always improves from its System 1 model alone. Experiments also show that both the rule-driven logical regularizer and the data-driven value regularizer are important and the performance improvement is marginal without the two regularizers, which indicates that learning from both logical prior and training data is important for reasoning tasks.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [8.047311782836914, 3.1173949241638184], "id": 413, "title": "Efficient Federated Learning on Knowledge Graphs via Privacy-preserving Relation Embedding Aggregation", "authors": "Kai Zhang, Yu Wang, Hongyi Wang, Lifu Huang, Carl Yang, Xun Chen and Lichao Sun", "abstract": "Federated learning (FL) can be essential in knowledge representation, reasoning, and data mining applications over multi-source knowledge graphs (KGs). A recent study FedE first proposes an FL framework that shares entity embeddings of KGs across all clients. However, entity embedding sharing from FedE would incur a severe privacy leakage. Specifically, the known entity embedding can be used to infer whether a specific relation between two entities exists in a private client. In this paper, we introduce a novel attack method that aims to recover the original data based on the embedding information, which is further used to evaluate the vulnerabilities of FedE. Furthermore, we propose a Federated learning paradigm with privacy-preserving Relation embedding aggregation (FedR) to tackle the privacy issue in FedE. Besides, relation embedding sharing can significantly reduce the communication cost due to its smaller size of queries. We conduct extensive experiments to evaluate FedR with five different KG embedding models and three datasets. Compared to FedE, FedR achieves similar utility and significant improvements regarding privacy-preserving effect and communication efficiency on the link prediction task.", "track": "Information Extraction", "label": 5}, {"loc": [7.637033462524414, 3.627551794052124], "id": 418, "title": "TextHacker: Learning based Hybrid Local Search Algorithm for Text Hard-label Adversarial Attack", "authors": "Zhen Yu, Xiaosen Wang, Wanxiang Che and Kun He", "abstract": "Existing textual adversarial attacks usually utilize the gradient or prediction confidence to generate adversarial examples, making it hard to be deployed in real-world applications. To this end, we consider a rarely investigated but more rigorous setting, namely hard-label attack, in which the attacker can only access the prediction label. In particular, we find we can learn the importance of different words via the change on prediction label caused by word substitutions on the adversarial examples. Based on this observation, we propose a novel adversarial attack, termed Text Hard-label attacker (TextHacker). TextHacker randomly perturbs lots of words to craft an adversarial example. Then, TextHacker adopts a hybrid local search algorithm with the estimation of word importance from the attack history to minimize the adversarial perturbation. Extensive evaluations for text classification and textual entailment show that TextHacker significantly outperforms existing hard-label attacks regarding the attack performance as well as adversary quality.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [5.4170331954956055, 12.334752082824707], "id": 420, "title": "Visualizing the Obvious: A Concreteness-based Ensemble Model for Noun Property Prediction", "authors": "Yue Yang, Artemis Panagopoulou, Marianna Apidianaki, Mark Yatskar and Chris Callison-Burch", "abstract": "Neural language models encode rich knowledge about entities and their relationships which can be extracted from their representations using probing. Common properties of nouns (e.g., red strawberries, small ant) are, however, more challenging to extract compared to other types of knowledge because they are rarely explicitly stated in texts.\nWe hypothesize this to mainly be the case for perceptual properties which are obvious to the participants in the communication. We propose to extract these properties from images and use them in an ensemble model, in order to complement the information that is extracted from language models. We consider perceptual properties to be more concrete than abstract properties (e.g., interesting, flawless). We propose to use the adjectives' concreteness score as a lever to calibrate the contribution of each source (text vs. images). We evaluate our ensemble model in a ranking task where the actual properties of a noun need to be ranked higher than other non-relevant properties. Our results show that the proposed combination of text and images greatly improves noun property prediction compared to powerful text-based language models.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [4.8586297035217285, 3.5495619773864746], "id": 428, "title": "It's Better to Teach Fishing than Giving a Fish: An Auto-Augmented Structure-aware Generative Model for Metaphor Detection", "authors": "Huawen Feng and Qianli Ma", "abstract": "Metaphor Detection aims to identify the metaphorical meaning of words in the sentence. Most existing work is discriminant models, which use the contextual semantic information extracted by transformers for classifications directly. Due to insufficient training data and corresponding paraphrases, recent methods focus on how to get external resources and utilize them to introduce more knowledge. Currently, contextual modeling and external data are two key issues in the field. In this paper, we propose **A**n **A**uto-**A**ugmented **S**tructure-aware generative model (**AAAS**) for metaphor detection, which transforms the classification task into a keywords-extraction task. Specifically, we propose the task of structure information extraction to allow the model to use the 'structural language' to describe the whole sentence. Furthermore, without any other external resources, we design a simple but effective auto-augmented method to expand the limited datasets. Experimental results show that **AAAS** obtains competitive results compared with state-of-the-art methods.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [7.902998924255371, 3.305649757385254], "id": 441, "title": "Expose Backdoors on the Way: A Feature-Based Efficient Defense against Textual Backdoor Attacks", "authors": "Sishuo Chen, Wenkai Yang, Zhiyuan Zhang, Xiaohan Bi and Xu Sun", "abstract": "Natural language processing (NLP) models are known to be vulnerable to backdoor attacks, which poses a newly arisen threat to NLP models. Prior online backdoor defense methods for NLP models only focus on the anomalies at either the input or output level, still suffering from fragility to adaptive attacks and high computational cost. In this work, we take the first step to investigate the unconcealment of textual poisoned samples at the intermediate-feature level and propose a feature-based efficient online defense method. Through extensive experiments on existing attacking methods, we find that the poisoned samples are far away from clean samples in the intermediate feature space of a poisoned NLP model. Motivated by this observation, we devise a distance-based anomaly score (DAN) to distinguish poisoned samples from clean samples at the feature level. Experiments on sentiment analysis and offense detection tasks demonstrate the superiority of DAN, as it substantially surpasses existing online defense methods in terms of defending performance and enjoys lower inference costs. Moreover, we show that DAN is also resistant to adaptive attacks based on feature-level regularization. Our code is available at https://github.com/lancopku/DAN.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.072072505950928, 6.979447841644287], "id": 443, "title": "Diving Deep into Modes of Fact Hallucinations in Dialogue Systems", "authors": "Souvik Das, Sougata Saha and Rohini Srihari", "abstract": "Knowledge Graph(KG) grounded conversations often use large pre-trained models and usually suffer from fact hallucination. Frequently entities with no references in knowledge sources and conversation history are introduced into responses, thus hindering the flow of the conversation\u2014existing work attempt to overcome this issue by tweaking the training procedure or using a multi-step refining method. However, minimal effort is put into constructing an entity-level hallucination detection system, which would provide fine-grained signals that control fallacious content while generating responses. As a first step to address this issue, we dive deep to identify various modes of hallucination in KG-grounded chatbots through human feedback analysis. Secondly, we propose a series of perturbation strategies to create a synthetic dataset named FADE (FActual Dialogue Hallucination DEtection Dataset). Finally, we conduct comprehensive data analyses and create multiple baseline models for hallucination detection to compare against human-verified data and already established benchmarks.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [5.092928409576416, 9.640813827514648], "id": 447, "title": "Representation Learning for Resource-Constrained Keyphrase Generation", "authors": "Di Wu, Wasi U. Ahmad, Sunipa Dev and Kai-Wei Chang", "abstract": "State-of-the-art keyphrase generation methods generally depend on large annotated datasets, limiting their performance in domains with limited annotated data. To overcome this challenge, we design a data-oriented approach that first identifies salient information using retrieval-based corpus-level statistics, and then learns a task-specific intermediate representation based on a pre-trained language model using large-scale unlabeled documents. We introduce salient span recovery and salient span prediction as denoising training objectives that condense the intra-article and inter-article knowledge essential for keyphrase generation. Through experiments on multiple keyphrase generation benchmarks, we show the effectiveness of the proposed approach for facilitating low-resource keyphrase generation and zero-shot domain adaptation. Our method especially benefits the generation of absent keyphrases, approaching the performance of models trained with large training sets.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [6.81849479675293, 6.171376705169678], "id": 451, "title": "Systematicity in GPT-3's Interpretation of Novel English Noun Compounds", "authors": "Siyan Li, Riley Carlson and Christopher Potts", "abstract": "Levin et al. (2019) show experimentally that the interpretations of novel English noun compounds (e.g., stew skillet), while not fully compositional, are highly predictable based on whether the modifier and head refer to artifacts or natural kinds. Is the large language model GPT-3 governed by the same interpretive principles? To address this question, we first compare Levin et al.'s experimental data with GPT-3 generations, finding a high degree of similarity. However, this evidence is consistent with GPT-3 reasoning only about specific lexical items rather than the more abstract conceptual categories of Levin et al.'s theory. To probe more deeply, we construct prompts that require the relevant kind of conceptual reasoning. Here, we fail to find convincing evidence that GPT-3 is reasoning about more than just individual lexical items. These results highlight the importance of controlling for low-level distributional regularities when assessing whether a large language model latently encodes a deeper theory.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [4.635993957519531, 6.975573539733887], "id": 460, "title": "CARE: Causality Reasoning for Empathetic Responses by Conditional Graph Generation", "authors": "Jiashuo WANG, Yi Cheng and Wenjie Li", "abstract": "Recent approaches to empathetic response generation incorporate emotion causalities to enhance comprehension of both the user's feelings and experiences. However, these approaches suffer from two critical issues. First, they only consider causalities between the user's emotion and the user's experiences, and ignore those between the user's experiences. Second, they neglect interdependence among causalities and reason them independently. To solve the above problems, we expect to reason all plausible causalities interdependently and simultaneously, given the user's emotion, dialogue history, and future dialogue content. Then, we infuse these causalities into response generation for empathetic responses. Specifically, we design a new model, i.e., the Conditional Variational Graph Auto-Encoder (CVGAE), for the causality reasoning, and adopt a multi-source attention mechanism in the decoder for the causality infusion. We name the whole framework as CARE, abbreviated for CAusality Reasoning for Empathetic conversation. Experimental results indicate that our method achieves state-of-the-art performance.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [1.7300877571105957, 9.157980918884277], "id": 469, "title": "TransAdv: A Translation-based Adversarial Learning Framework for Zero-Resource Cross-Lingual Named Entity Recognition", "authors": "Yichun Zhao, Jintao Du, Gongshen Liu and Huijia Zhu", "abstract": "Zero-Resource Cross-Lingual Named Entity Recognition aims at training an NER model of the target language using only labeled source language data and unlabeled target language data. Existing methods are mainly divided into three categories: model transfer based, data transfer based and knowledge transfer based. Each method has its own disadvantages, and combining more than one of them often leads to better performance. However, the performance of data transfer based methods is often limited by inevitable noise in the translation process. To handle the problem, we propose a framework named TransAdv to mitigate lexical and syntactic errors of word-by-word translated data, better utilizing the data by multi-level adversarial learning and multi-model knowledge distillation. Extensive experiments are conducted over 6 target languages with English as the source language, and the results show that TransAdv achieves competitive performance to the state-of-the-art models.", "track": "Multilinguality", "label": 13}, {"loc": [5.479987144470215, 7.503069877624512], "id": 496, "title": "BARLE: Background-Aware Representation Learning for Background Shift Out-of-Distribution Detection", "authors": "Hanyu Duan, Yi Yang, Ahmed Abbasi and Kar Yan Tam", "abstract": "Machine learning models often suffer from a performance drop when they are applied to out-of-distribution (OOD) samples, i.e., those drawn far away from the training data distribution. Existing OOD detection work mostly focuses on identifying semantic-shift OOD samples, e.g., instances from unseen new classes. However, background-shift OOD detection, which identifies samples with domain or style-change, represents a more practical yet challenging task. In this paper, we propose Background-Aware Representation Learning (BARLE) for background-shift OOD detection in NLP. Specifically, we generate semantics-preserving background-shifted pseudo OOD samples from pretrained masked language models. We then contrast the in-distribution (ID) samples with their pseudo OOD counterparts. Unlike prior semantic-shift OOD detection work that often leverages an external text corpus, BARLE only uses ID data, which is more flexible and cost-efficient. In experiments across several text classification tasks, we demonstrate that BARLE is capable of improving background-shift OOD detection performance while maintaining ID classification accuracy. We further investigate the properties of the generated pseudo OOD samples, uncovering the working mechanism of BARLE.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [8.97835922241211, 8.205751419067383], "id": 507, "title": "What Language Model to Train if You Have One Million GPU Hours?", "authors": "Teven Le Scao, Thomas Wang, Daniel Hesslow, Stas Bekman, M Saiful Bari, Stella Biderman, Hady Elsahar, Niklas Muennighoff, Jason Phang, Ofir Press, Colin Raffel, Victor Sanh, Sheng Shen, Lintang Sutawika, Jaesung Tae, Zheng Xin Yong, Julien Launay and Iz Beltagy", "abstract": "The crystallization of modeling methods around the Transformer architecture has been a boon for practitioners. \nSimple, well-motivated architectural variations can transfer across tasks and scale, increasing the impact of modeling research. \nHowever, with the emergence of state-of-the-art 100B+ parameters models, large language models are increasingly expensive to accurately design and train. \nNotably, it can be difficult to evaluate how modeling decisions may impact emergent capabilities, given that these capabilities arise mainly from sheer scale.\nTargeting a multilingual language model in the 100B+ parameters scale, our goal is to identify an architecture and training setup that makes the best use of our 1,000,000 A100-GPU-hours budget.\nSpecifically, we perform an ablation study at the billion-parameter scale comparing different modeling practices and their impact on zero-shot generalization.\nIn addition, we study the impact of various popular pre-training corpora on zero-shot generalization. \nWe also study the performance of a multilingual model and how it compares to the English-only one. \nFinally, we consider the scaling behaviour of Transformers to choose the target model size, shape, and training setup. All our models and code are open-sourced at \\url{https://github.com/anonymous}.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [5.441365718841553, 7.505604267120361], "id": 511, "title": "Enhancing Out-of-Distribution Detection in Natural Language Understanding via Implicit Layer Ensemble", "authors": "Hyunsoo Cho, Choonghyun Park, Jaewook Kang, Kang Min Yoo, Taeuk Kim and Sang-goo Lee", "abstract": "Out-of-distribution (OOD) detection aims to discern outliers from the intended data distribution, which is crucial to maintaining high reliability and a good user experience.\nMost recent studies in OOD detection utilize the information from a single representation that resides in the penultimate layer to determine whether the input is anomalous or not.\nAlthough such a method is straightforward, the potential of diverse information in the intermediate layers is overlooked.\nIn this paper, we propose a novel framework based on contrastive learning that encourages intermediate features to learn layer-specialized representations and assembles them implicitly into a single representation to absorb rich information in the pre-trained language model. \nExtensive experiments in various intent classification and OOD datasets demonstrate that our approach is significantly more effective than other works.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [7.589073657989502, 9.04870891571045], "id": 521, "title": "Contrastive Demonstration Tuning for Pre-trained Language Models", "authors": "Xiaozhuan Liang, Ningyu Zhang, Siyuan Cheng, Zhenru Zhang, Chuanqi Tan and Huajun Chen", "abstract": "Pretrained language models can be effectively stimulated by textual prompts or demonstrations, especially in low-data scenarios. Recent works have focused on automatically searching discrete or continuous prompts or optimized verbalizers, yet studies for the demonstration are still limited. Concretely, the demonstration examples are crucial for an excellent final performance of prompt-tuning. In this paper, we propose a novel pluggable, extensible, and efficient approach named contrastive demonstration tuning, which is free of demonstration sampling. Furthermore, the proposed approach can be: (i) Plugged into any previous prompt-tuning approaches; (ii) Extended to widespread classification tasks with a large number of categories. Experimental results on 16 datasets illustrate that our method integrated with previous approaches LM-BFF and P-tuning can yield better performance. Code is available in https://github.com/zjunlp/PromptKG/tree/main/research/Demo-Tuning.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [6.460696697235107, 1.9188812971115112], "id": 525, "title": "Detect-Localize-Repair: A Unified Framework for Learning to Debug with CodeT5", "authors": "Nghi D. Q. Bui, Yue Wang and Steven C.H. Hoi", "abstract": "Automated software debugging is a crucial task for improving the productivity of software developers. Many neural-based techniques have been proven effective for debugging-related tasks such as bug localization and program repair (or bug fixing). However, these techniques often focus only on either one of them or approach them in a stage-wise manner, ignoring the mutual benefits between them. In this work, we propose a novel unified Detect-Localize-Repair framework based on a pretrained programming language model CodeT5 to seamlessly address these tasks, named CodeT5-DLR. Specifically, we propose three objectives to adapt the generic CodeT5 for debugging: a bug detection objective to determine whether a given code snippet is buggy or not, a bug localization objective to identify the buggy lines, and a program repair objective to translate the buggy code to its fixed version. \nWe evaluate it on each of these tasks and their combined setting on two newly collected line-level debugging datasets in Java and Python. Extensive results show that our model significantly outperforms existing baselines from both NLP and software engineering domains.", "track": "NLP Applications", "label": 0}, {"loc": [8.380699157714844, 6.617035388946533], "id": 539, "title": "Influence Functions for Sequence Tagging Models", "authors": "Sarthak Jain, Varun Manjunatha, Byron C. Wallace and Ani Nenkova", "abstract": "Many standard tasks in NLP (e.g., Named Entity Recognition, Part-of-Speech tagging, and Semantic Role Labeling) are naturally framed as sequence tagging problems. However, there has been comparatively little work on interpretability methods for sequence tagging models. In this paper, we extend influence functions --- which aim to trace predictions back to the training points that informed them --- to sequence tagging tasks. We define the influence of a training instance segment as the effect that perturbing the labels within this segment has on a test segment level prediction. We provide an efficient approximation to compute this, and show that it tracks with the \"true\" segment influence (measured empirically). We show the practical utility of segment influence by using the method to identify noisy annotations in NER corpora.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [4.401266574859619, 3.7153160572052], "id": 540, "title": "Impact of Pretraining Term Frequencies on Few-Shot Numerical Reasoning", "authors": "Yasaman Razeghi, Robert L Logan IV, Matt Gardner and Sameer Singh", "abstract": "Pretrained Language Models (LMs) have demonstrated ability to perform numerical reasoning by extrapolating from a few examples in few-shot settings. However, the extent to which this extrapolation relies on robust reasoning is unclear. In this paper, we investigate how well these models reason with terms that are less frequent in the pretraining data. In particular, we examine the correlations between the model performance on test instances and the frequency of terms from those instances in the pretraining data. We measure the strength of this correlation for a number of GPT-based language models (pretrained on the Pile dataset) on various numerical deduction tasks (e.g., arithmetic and unit conversion). Our results consistently demonstrate that models are more accurate on instances whose terms are more prevalent, in some cases above 70% (absolute) more accurate on the top 10\\% frequent terms in comparison to the bottom 10%. Overall, although LMs appear successful at few-shot numerical reasoning, our results raise the question of how much models actually generalize beyond pretraining data, and we encourage researchers to take the pretraining data into account when interpreting evaluation results.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [3.854273557662964, 6.5489277839660645], "id": 551, "title": "Syntactic and Semantic Uniformity for Semantic Parsing and Task-Oriented Dialogue Systems", "authors": "Bowen Chen and Yusuke Miyao", "abstract": "This paper proposes a data representation framework for semantic parsing and task-oriented dialogue systems, aiming to achieve a uniform representation for syntactically and semantically diverse machine-readable formats.\nCurrent NLP systems heavily rely on adapting pre-trained language models to specific tasks, and this approach has been proven effective for modeling natural language texts.\nHowever, little attention has been paid to the representation of machine-readable formats, such as database queries and dialogue states.\nWe present a method for converting original machine-readable formats of semantic parsing and task-oriented dialogue datasets into a syntactically and semantically uniform representation.\nWe define a meta grammar for syntactically uniform representations and translate semantically equivalent functions into a uniform vocabulary.\nEmpirical experiments on 13 datasets show that accuracy consistently improves over original formats, revealing the advantage of the proposed representation.\nAdditionally, we show that the proposed representation allows for transfer learning across datasets.", "track": "Resources and Evaluation", "label": 1}, {"loc": [1.919975757598877, 8.669801712036133], "id": 556, "title": "Knowledge-Rich Self-Supervision for Biomedical Entity Linking", "authors": "Sheng Zhang, Hao Cheng, Shikhar Vashishth, Cliff R. Wong, Jinfeng Xiao, Xiaodong Liu, Tristan Naumann, Jianfeng Gao and Hoifung Poon", "abstract": "Entity linking faces significant challenges such as prolific variations and prevalent ambiguities, especially in high-value domains with myriad entities. Standard classification approaches suffer from the annotation bottleneck and cannot effectively handle unseen entities. Zero-shot entity linking has emerged as a promising direction for generalizing to new entities, but it still requires example gold entity mentions during training and canonical descriptions for all entities, both of which are rarely available outside of Wikipedia. In this paper, we explore Knowledge-RIch Self-Supervision (KRISS) for biomedical entity linking, by leveraging readily available domain knowledge. In training, it generates self-supervised mention examples on unlabeled text using a domain ontology and trains a contextual encoder using contrastive learning. For inference, it samples self-supervised mentions as prototypes for each entity and conducts linking by mapping the test mention to the most similar prototype. Our approach can easily incorporate entity descriptions and gold mention labels if available. We conducted extensive experiments on seven standard datasets spanning biomedical literature and clinical notes. Without using any labeled information, our method produces KRISSBERT, a universal entity linker for four million UMLS entities that attains new state of the art, outperforming prior self-supervised methods by as much as 20 absolute points in accuracy. We released KRISSBERT at https://aka.ms/krissbert.", "track": "Information Extraction", "label": 5}, {"loc": [5.008426189422607, 12.517354011535645], "id": 584, "title": "ARTIST: A Transformer-based Chinese Text-to-Image Synthesizer Digesting Linguistic and World Knowledge", "authors": "Tingting Liu, Chengyu Wang, Xiangru Zhu, Lei Li, Minghui Qiu, jun huang, Ming Gao and Yanghua Xiao", "abstract": "Text-to-Image Synthesis (TIS) is a popular task to convert natural language texts into realistic images. Recently, transformer-based TIS models (such as DALL-E) have been proposed using the encoder-decoder architectures. Yet, these billion-scale TIS models are difficult to tune and deploy in resource-constrained environments. In addition, there is a lack of language-specific TIS benchmarks for Chinese, together with high-performing models with moderate sizes. In this work, we present ARTIST, A tRansformer-based Chinese Text-to-Image SynThesizer for high-resolution image generation. In ARTIST, the rich linguistic and relational knowledge facts are injected into the model to ensure better model performance without the usage of ultra-large models. We further establish a large-scale Chinese TIS benchmark with the re-production results of state-of-the-art transformer-based TIS models.\nResults show ARTIST outperforms previous approaches.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [10.842288970947266, 9.322296142578125], "id": 585, "title": "From Spelling to Grammar: A New Framework for Chinese Grammatical Error Correction", "authors": "Xiuyu Wu and Yunfang Wu", "abstract": "Chinese Grammatical Error Correction (CGEC) aims to generate a correct sentence from an erroneous sequence, where different kinds of errors are mixed. This paper divides the CGEC task into two steps, namely spelling error correction and grammatical error correction. We firstly propose a novel zero-shot approach for spelling error correction, which is simple but effective, obtaining a high precision to avoid error accumulation of the pipeline structure. To handle grammatical error correction, we design part-of-speech (POS) features and semantic class features to enhance the neural network model, and propose an auxiliary task to predict the POS sequence of the target sentence. \nOur proposed framework achieves a 42.11 F-0.5 score on CGEC dataset without using any synthetic data or data augmentation methods, which outperforms the previous state-of-the-art by a wide margin of 1.30 points. Moreover, our model produces meaningful POS representations that capture different POS words and convey reasonable POS transition rules.", "track": "Natural Language Generation", "label": 6}, {"loc": [4.486323356628418, 4.576927185058594], "id": 593, "title": "Language Models Are Poor Learners of Directional Inference", "authors": "Tianyi Li, Mohammad Javad Hosseini, Sabine Weber and Mark Steedman", "abstract": "We examine LMs' competence of directional predicate entailments by supervised fine-tuning with prompts. Our analysis shows that contrary to their apparent success on standard NLI, LMs show limited ability to learn such directional inference; moreover, existing datasets fail to test directionality, and/or are infested by artefacts that can be learnt as proxy for entailments, yielding over-optimistic results. In response, we present BoOQA (Boolean Open QA), a robust multi-lingual evaluation benchmark for directional predicate entailments, extrinsic to existing training sets. On BoOQA, we establish baselines and show evidence of existing LM-prompting models being incompetent directional entailment learners, in contrast to entailment graphs, however limited by sparsity.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [4.758921146392822, 6.938360214233398], "id": 595, "title": "Wish I Can Feel What You Feel: A Neural Approach for Empathetic Response Generation", "authors": "Yangbin Chen and Chunfeng Liang", "abstract": "Expressing empathy is important in everyday conversations, and exploring how empathy arises is crucial in automatic response generation. Most previous approaches consider only a single factor that affects empathy. However, in practice, empathy generation and expression is a very complex and dynamic psychological process. A listener needs to find out events which cause a speaker's emotions (emotion cause extraction), project the events into some experience (knowledge extension), and express empathy in the most appropriate way (communication mechanism).To this end, we propose a novel approach, which integrates the three components - emotion cause, knowledge graph, and communication mechanism for empathetic response generation.Experimental results on the benchmark dataset demonstrate the effectiveness of our method and show that incorporating the key components generates more informative and empathetic responses.", "track": "Natural Language Generation", "label": 6}, {"loc": [4.320400238037109, 7.360043048858643], "id": 609, "title": "Measuring and Improving Semantic Diversity of Dialogue Generation", "authors": "Seungju Han, Beomsu Kim and Buru Chang", "abstract": "Response diversity has become an important criterion for evaluating the quality of open-domain dialogue generation models. However, current evaluation metrics for response diversity often fail to capture the semantic diversity of generated responses, as they mainly consider lexical aspects of the generated responses. In this paper, we introduce a new automatic evaluation metric to measure the semantic diversity of generated responses. Through human evaluation, we demonstrate that our proposed metric captures human judgments on response diversity better than existing lexical-level diversity metrics. Furthermore, motivated by analyzing an existing dialogue dataset, we propose a simple yet effective learning method that improves the semantic diversity of generated responses. Our learning method weights training samples based on the semantic distribution of the training set.\nWe show that our learning method improves response diversity and coherency better than other baseline methods through automatic and human evaluation.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [5.37562894821167, 12.172391891479492], "id": 612, "title": "Plug-and-Play VQA: Zero-shot VQA by Conjoining Large Pretrained Models with Zero Training", "authors": "Anthony Meng Huat Tiong, Junnan Li, Boyang Li, Silvio Savarese and Steven C.H. Hoi", "abstract": "Visual question answering (VQA) is a hallmark of vision and language reasoning\nand a challenging task under the zero-shot setting.\nWe propose Plug-and-Play VQA (PNP-VQA),\na modular framework for zero-shot VQA.\nIn contrast to most existing works, which require substantial adaptation of pretrained language models (PLMs) for the vision modality,\nPNP-VQA requires no additional training of the PLMs.\nInstead, we propose to use natural language and network interpretation as an intermediate representation that glues pretrained models together. We first generate question-guided informative image captions,\nand pass the captions to a PLM as context for question answering.\nSurpassing end-to-end trained baselines, PNP-VQA achieves state-of-the-art results on zero-shot VQAv2 and GQA. With 11B parameters, it outperforms the 80B-parameter Flamingo model by 8.5% on VQAv2. \nWith 738M PLM parameters, PNP-VQA achieves an improvement of 9.1% on GQA over FewVLM with 740M PLM parameters.", "track": "Question Answering", "label": 11}, {"loc": [4.736835956573486, 3.3000881671905518], "id": 613, "title": "TSGP: Two-Stage Generative Prompting for Unsupervised Commonsense Question Answering", "authors": "Yueqing Sun, Yu Zhang, Le Qi and Qi Shi", "abstract": "Without training on labeled task data, unsupervised commonsense question answering seems challenging since it requires commonsense knowledge beyond the context of questions. Previous methods typically retrieved from traditional knowledge bases or used pre-trained language models (PrLMs) to generate fixed types of knowledge, which have poor generalization ability.\nIn this paper, we aim to address the above limitation by leveraging the implicit knowledge stored in PrLMs and propose a two-stage prompt-based unsupervised commonsense question answering framework (TSGP). We first use knowledge generation prompts to generate the knowledge required for questions with unlimited types and possible candidate answers independent of specified choices. Then, we further utilize answer generation prompts to generate possible candidate answers independent of specified choices. Experimental results and analysis on three different commonsense reasoning tasks, CommonsenseQA, OpenBookQA, and SocialIQA, demonstrate that TSGP significantly improves the reasoning ability of language models in unsupervised settings.", "track": "Question Answering", "label": 11}, {"loc": [10.825733184814453, 6.994398593902588], "id": 623, "title": "Subword-Delimited Downsampling for Better Character-Level Translation", "authors": "Lukas Edman, Antonio Toral and Gertjan van Noord", "abstract": "Subword-level models have been the dominant paradigm in NLP. However, character-level models have the benefit of seeing each character individually, providing the model with more detailed information that ultimately could lead to better models. Recent works have shown character-level models to be competitive with subword models, but costly in terms of time and computation. Character-level models with a downsampling component alleviate this, but at the cost of quality, particularly for machine translation. \nThis work analyzes the problems of previous downsampling methods and introduces a novel downsampling method which is informed by subwords.\nThis new downsampling method not only outperforms existing downsampling methods, showing that downsampling characters can be done without sacrificing quality, but also leads to promising performance compared to subword models for translation.", "track": "Machine Translation", "label": 10}, {"loc": [1.1194785833358765, 8.092310905456543], "id": 624, "title": "Autoregressive Structured Prediction with Language Models", "authors": "Tianyu Liu, Yuchen Eleanor Jiang, Nicholas Monath, Ryan Cotterell and Mrinmaya Sachan", "abstract": "Recent years have seen a paradigm shift in NLP towards using pretrained language models ({PLM}) for a wide range of tasks.\n However, there are many difficult design decisions to represent structures (e.g. tagged text, coreference chains) in a way such that they can be captured by PLMs.\n Prior work on structured prediction with PLMs typically flattens the structured output into a sequence, which limits the quality of structural information being learned and leads to inferior performance compared to classic discriminative models. \n In this work, we describe an approach to model structures as sequences of actions in an autoregressive manner with PLMs, allowing in-structure dependencies to be learned without any loss. \n Our approach achieves the new state-of-the-art on all the structured prediction tasks we looked at, namely, named entity recognition, end-to-end relation extraction, and coreference resolution.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [7.099002361297607, 8.189377784729004], "id": 655, "title": "XDoc: Unified Pre-training for Cross-Format Document Understanding", "authors": "Jingye Chen, Tengchao Lv, Lei Cui, Cha Zhang and Furu Wei", "abstract": "The surge of pre-training has witnessed the rapid development of document understanding recently. Pre-training and fine-tuning framework has been effectively used to tackle texts in various formats, including plain texts, document texts, and web texts. Despite achieving promising performance, existing pre-trained models usually target one specific document format at one time, making it difficult to combine knowledge from multiple document formats. To address this, we propose XDoc, a unified pre-trained model which deals with different document formats in a single model. For parameter efficiency, we share backbone parameters for different formats such as the word embedding layer and the Transformer layers. Meanwhile, we introduce adaptive layers with lightweight parameters to enhance the distinction across different formats. Experimental results have demonstrated that with only 36.7\\% parameters, XDoc achieves comparable or even better performance on a variety of downstream tasks compared with the individual pre-trained models, which is cost effective for real-world deployment. The code and pre-trained models are publicly available at \\url{https://aka.ms/xdoc}.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [3.1851890087127686, 4.5890727043151855], "id": 658, "title": "A Few More Examples May Be Worth Billions of Parameters", "authors": "Yuval Kirstain, Patrick Lewis, Sebastian Riedel and Omer Levy", "abstract": "We investigate the dynamics of increasing the number of model parameters versus the number of labeled examples across a wide variety of tasks. Our exploration reveals that while scaling parameters consistently yields performance improvements, the contribution of additional examples highly depends on the task's format. Specifically, in open question answering tasks, enlarging the training set does not improve performance. In contrast, classification, extractive question answering, and multiple choice tasks benefit so much from additional examples that collecting a few hundred examples is often \"worth\" billions of parameters. We hypothesize that unlike open question answering, which involves recalling specific information, solving strategies for tasks with a more restricted output space transfer across examples, and can therefore be learned with small amounts of labeled data.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [4.210374355316162, 7.513924598693848], "id": 660, "title": "MCP: Self-supervised Pre-training for Personalized Chatbots with Multi-level Contrastive Sampling", "authors": "Zhaoheng Huang, Zhicheng Dou, Yutao Zhu and Zhengyi Ma", "abstract": "Personalized chatbots focus on endowing the chatbots with a consistent personality to behave like real users and further act as personal assistants. Previous studies have explored generating implicit user profiles from the user's dialogue history for building personalized chatbots. However, these studies only use the response generation loss to train the entire model, thus it is prone to suffer from the problem of data sparsity. Besides, they overemphasize the final generated response's quality while ignoring the correlations and fusions between the user's dialogue history, leading to rough data representations and performance degradation. To tackle these problems, we propose a self-supervised learning framework MCP for capturing better representations from users' dialogue history for personalized chatbots. Specifically, we apply contrastive sampling methods to leverage the supervised signals hidden in user dialog history, and generate the pre-training samples for enhancing the model. We design three pre-training tasks based on three types of contrastive pairs from user dialogue history, namely response pairs, sequence augmentation pairs, and user pairs. We pre-train the utterance encoder and the history encoder towards the contrastive objectives and use these pre-trained encoders for generating user profiles while personalized response generation. Experimental results on two real-world datasets show a significant improvement in our proposed model MCP compared with the existing methods.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [2.5974152088165283, 4.67689847946167], "id": 662, "title": "ExpertPLM: Pre-training Expert Representation for Expert Finding", "authors": "Qiyao Peng and Hongtao Liu", "abstract": "Expert Finding is an important task in Community Question Answering (CQA) platforms, which could help route questions to potential users to answer. The key is to learn representations of experts based on their historical answered questions accurately. In this paper, inspired by the strong text understanding ability of Pretrained Language modelings (PLMs), we propose a pre-training and fine-tuning expert finding framework. The core is that we design an expert-level pre-training paradigm, that effectively integrates expert interest and expertise simultaneously. Specifically different from the typical corpus-level pre-training, we treat each expert as the basic pre-training unit including all the historical answered question titles of the expert, which could fully indicate the expert interests for questions. Besides, we integrate the vote score information along with each answer of the expert into the pre-training phrase to model the expert ability explicitly. Finally, we propose a novel reputation-augmented Masked Language Model (MLM) pre-training strategy to capture the expert reputation information. In this way, our method could learn expert representation comprehensively, which then will be adopted and fine-tuned in the down-streaming expert-finding task. Extensive experimental results on six real-world CQA datasets demonstrate the effectiveness of our method.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [4.0651936531066895, 7.007589340209961], "id": 664, "title": "You Truly Understand What I Need : Intellectual and Friendly Dialog Agents grounding Persona and Knowledge", "authors": "Jungwoo Lim, Myugnhoon Kang, Yuna Hur, Seung Won Jeong, Jinsung Kim, Yoonna Jang, Dongyub Lee, Hyesung Ji, DongHoon Shin, Seungryong Kim and Heuiseok Lim", "abstract": "To build a conversational agent that interacts fluently with humans, previous studies blend knowledge or personal profile into the pre-trained language model. However, the model that considers knowledge and persona at the same time is still limited, leading to hallucination and a passive way of using personas. We propose an effective dialogue agent that grounds external knowledge and persona simultaneously. The agent selects the proper knowledge and persona to use for generating the answers with our candidate scoring implemented with a poly-encoder. Then, our model generates the utterance with lesser hallucination and more engagingness utilizing retrieval augmented generation with knowledge-persona enhanced query. We conduct experiments on the persona-knowledge chat and achieve state-of-the-art performance in grounding and generation tasks on the automatic metrics. Moreover, we validate the answers from the models regarding hallucination and engagingness through human evaluation and qualitative results. We show our retriever's effectiveness in extracting relevant documents compared to the other previous retrievers, along with the comparison of multiple candidate scoring methods. Code is available at \\url{https://github.com/dlawjddn803/INFO}", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [3.922996997833252, 9.677489280700684], "id": 668, "title": "Faithful to the Document or to the World? Mitigating Hallucinations via Entity-Linked Knowledge in Abstractive Summarization", "authors": "Yue Dong, John Wieting and Pat Verga", "abstract": "Existing abstractive summarization systems are hampered by content hallucinations in which models generate text that is not directly inferable from the source alone. Annotations from prior work have shown that some of these hallucinations, while being `unfaithful' to the source, are nonetheless factual. Our analysis in this paper suggests that these factual hallucinations occur as a result of the prevalence of factual yet unfaithful entities in summarization datasets. We find that these entities are not aberrations, but instead examples of additional world knowledge being readily used to latently connect entities and concepts -- in this case connecting entities in the source document to those in the target summary. In our analysis and experiments, we demonstrate that connecting entities to an external knowledge base can lend provenance to many of these unfaithful yet factual entities, and further, this knowledge can be used to improve the factuality of summaries without simply making them more extractive.", "track": "Summarization", "label": 14}, {"loc": [8.072550773620605, 8.098310470581055], "id": 673, "title": "RL with KL penalties is better viewed as Bayesian inference", "authors": "Tomasz Korbak, Ethan Perez and Christopher L. Buckley", "abstract": "Reinforcement learning (RL) is frequently employed in fine-tuning large language models (LMs), such as GPT-3, to penalize them for undesirable features of generated sequences, such as offensiveness, social bias, harmfulness or falsehood. The RL formulation involves treating the LM as a policy and updating it to maximise the expected value of a reward function which captures human preferences, such as non-offensiveness. \n\nIn this paper, we analyze challenges associated with treating a language model as an RL policy and show how avoiding those challenges requires moving beyond the RL paradigm. We start by observing that the standard RL approach is flawed as an objective for fine-tuning LMs because it leads to distribution collapse: turning the LM into a degenerate distribution. \n\nThen, we analyze KL-regularised RL, a widely used recipe for fine-tuning LMs, which additionally constrains the fine-tuned LM to stay close to its original distribution in terms of Kullback-Leibler (KL) divergence. We show that KL-regularised RL is equivalent to variational inference: approximating a Bayesian posterior which specifies how to update a prior LM to conform with evidence provided by the reward function. We argue that this Bayesian inference view of KL-regularised RL is more insightful than the typically employed RL perspective. \n\nThe Bayesian inference view explains how KL-regularised RL avoids the distribution collapse problem and offers a first-principles derivation for its objective. While this objective happens to be equivalent to RL (with a particular choice of parametric reward), there exist other objectives for fine-tuning LMs which are no longer equivalent to RL. That observation leads to a more general point: RL is not an adequate formal framework for problems such as fine-tuning language models. These problems are best viewed as Bayesian inference: approximating a pre-defined target distribution.", "track": "Theme Track", "label": 18}, {"loc": [1.8155336380004883, 3.973254919052124], "id": 691, "title": "Evaluating Token-Level and Passage-Level Dense Retrieval Models for Math Information Retrieval", "authors": "Wei Zhong, Jheng-Hong Yang, YUQING XIE and Jimmy Lin", "abstract": "With the recent success of dense retrieval methods based on bi-encoders, studies have applied this approach to various interesting downstream retrieval tasks with good efficiency and in-domain effectiveness.\nRecently, we have also seen the presence of dense retrieval models in Math Information Retrieval (MIR) tasks,\nbut the most effective systems remain classic retrieval methods that consider hand-crafted structure features.\nIn this work, we try to combine the best of both worlds:\\ a well-defined structure search method for effective formula search and efficient bi-encoder dense retrieval models to capture contextual similarities.\nSpecifically, we have evaluated two representative bi-encoder models for token-level and passage-level dense retrieval on recent MIR tasks.\nOur results show that bi-encoder models are highly complementary to existing structure search methods, and we are able to advance the state-of-the-art on MIR datasets.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [4.051512241363525, 3.95515513420105], "id": 694, "title": "Multi-View Reasoning: Consistent Contrastive Learning for Math Word Problem", "authors": "Wenqi Zhang, Yongliang Shen, Yanna Ma, Xiaoxia Cheng, Zeqi Tan, Qingpeng Nong and Weiming Lu", "abstract": "Math word problem solver requires both precise relation reasoning about quantities in the text and reliable generation for the diverse equation. Current sequence-to-tree or relation extraction methods regard this only from a fixed view, struggling to simultaneously handle complex semantics and diverse equations. However, human solving naturally involves two consistent reasoning views: top-down and bottom-up, just as math equations also can be expressed in multiple equivalent forms: pre-order and post-order. We propose a multi-view consistent contrastive learning for a more complete semantics-to-equation mapping. The entire process is decoupled into two independent but consistent views: top-down decomposition and bottom-up construction, and the two reasoning views are aligned in multi-granularity for consistency, enhancing global generation and precise reasoning. Experiments on multiple datasets across two languages show our approach significantly outperforms the existing baselines, especially on complex problems. We also show after consistent alignment, multi-view can absorb the merits of both views and generate more diverse results consistent with the mathematical laws.", "track": "NLP Applications", "label": 0}, {"loc": [8.041875839233398, 9.677835464477539], "id": 697, "title": "Few-shot initializing of Active Learner via Meta-Learning", "authors": "Zi Long Zhu, Vikrant Yadav, Zubair Afzal and George Tsatsaronis", "abstract": "Despite the important evolutions in few-shot and zero-shot learning techniques, domain specific applications still require expert knowledge and significant effort in annotating and labeling a large volume of unstructured textual data. To mitigate this problem, active learning, and meta-learning attempt to reach a high performance with the least amount of labeled data. In this paper, we introduce a novel approach to combine both lines of work by initializing an active learner with meta-learned parameters obtained through meta-training on tasks similar to the target task during active learning. In this approach we use the pre-trained BERT as our text-encoder and meta-learn its parameters with LEOPARD, which extends the model-agnostic meta-learning method by generating task dependent softmax weights to enable learning across tasks with different number of classes. We demonstrate the effectiveness of our method by performing active learning on five natural language understanding tasks and six datasets with five different acquisition functions. We train two different meta-initializations, and we use the pre-trained BERT base initialization as baseline. We observe that our approach performs better than the baseline at low budget, especially when closely related tasks were present during meta-learning. Moreover, our results show that better performance in the initial phase, i.e., with fewer labeled samples, leads to better performance when larger acquisition batches are used. We also perform an ablation study of the proposed method, showing that active learning with only the meta-learned weights is beneficial and adding the meta-learned learning rates and generating the softmax have negative consequences for the performance.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [9.696150779724121, 7.728682994842529], "id": 700, "title": "Bootstrapping meaning through listening: Unsupervised learning of spoken sentence embeddings", "authors": "Jian Zhu, Zuoyu Tian, Yadong Liu, Cong Zhang and Chia-Wen Lo", "abstract": "Inducing semantic representations directly from speech signals is a highly challenging task but has many useful applications in speech mining and spoken language understanding. This study tackles the unsupervised learning of semantic representations for spoken utterances. Through converting speech signals into hidden units generated from acoustic unit discovery, we propose WavEmbed, a multimodal sequential autoencoder that predicts hidden units from a dense representation of speech. Secondly, we also propose S-HuBERT to induce meaning through knowledge distillation, in which a sentence embedding model is first trained on hidden units and passes its knowledge to a speech encoder through contrastive learning. The best performing model achieves a moderate correlation (0.5~0.6) with human judgments, without relying on any labels or transcriptions. Furthermore, these models can also be easily extended to leverage textual transcriptions of speech to learn much better speech embeddings that are strongly correlated with human annotations. Our proposed methods are applicable to the development of purely data-driven systems for speech mining, indexing and search.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [9.467338562011719, 6.337846279144287], "id": 712, "title": "Progressive Sentiment Analysis for Code-Switched Text Data", "authors": "Sudhanshu Ranjan, Dheeraj Mekala and Jingbo Shang", "abstract": "Multilingual transformer language models have recently attracted much attention from researchers and are used in cross-lingual transfer learning for many NLP tasks such as text classification and named entity recognition.\nHowever, similar methods for transfer learning from monolingual text to code-switched text have not been extensively explored mainly due to the following challenges:\n(1) Code-switched corpus, unlike monolingual corpus, consists of more than one language and existing methods can't be applied efficiently,\n(2) Code-switched corpus is usually made of resource-rich and low-resource languages and upon using multilingual pre-trained language models, the final model might bias towards resource-rich language. \nIn this paper, we focus on code-switched sentiment analysis where we have a labelled resource-rich language dataset and unlabelled code-switched data. We propose a framework that takes the distinction between resource-rich and low-resource language into account.\nInstead of training on the entire code-switched corpus at once, we create buckets based on the fraction of words in the resource-rich language and progressively train from resource-rich language dominated samples to low-resource language dominated samples. \nExtensive experiments across multiple language pairs demonstrate that progressive training helps low-resource language dominated samples.", "track": "Multilinguality", "label": 13}, {"loc": [4.075523376464844, 5.934412956237793], "id": 714, "title": "Knowledge Stimulated Contrastive Prompting for Low-Resource Stance Detection", "authors": "Kai Zheng, Qingfeng Sun, Yaming Yang and Fei Xu", "abstract": "Stance Detection Task (SDT) aims at identifying the stance of the sentence towards a specific target and is usually modeled as a classification problem. Backgound knowledge is often necessary for stance detection with respect to a specific target, especially when there is no target explicitly mentioned in text. This paper focuses on the knowledge stimulation for low-resource stance detection tasks. We firstly explore to formalize stance detection as a prompt based contrastive learning task. At the same time, to make prompt learning suit to stance detection, we design a template mechanism to incorporate corresponding target into instance representation. Furthermore, we propose a masked language prompt joint contrastive learning approach to stimulate the knowledge inherit from the pre-trained model. The experimental results on three benchmarks show that knowledge stimulation is effective in stance detection accompanied with our proposed mechanism.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [10.827001571655273, 9.343551635742188], "id": 745, "title": "WSpeller: Robust Word Segmentation for Enhancing Chinese Spelling Check", "authors": "Fangfang Li, Youran Shan, Junwen Duan, Xingliang Mao and Minlie Huang", "abstract": "Chinese spelling check (CSC) detects and corrects spelling errors in Chinese texts. Previous approaches have combined character-level phonetic and graphic information, ignoring the importance of segment-level information. According to our pilot study, spelling errors are always associated with incorrect word segmentation. When appropriate word boundaries are provided, CSC performance is greatly enhanced. Based on these findings, we present WSpeller, a CSC model that takes into account word segmentation. A fundamental component of WSpeller is a W-MLM, which is trained by predicting visually and phonetically similar words. Through modification of the embedding layer's input, word segmentation information can be incorporated. Additionally, a robust module is trained to assist the W-MLM-based correction module by predicting the correct word segmentations from sentences containing spelling errors. We evaluate WSpeller on the widely used benchmark datasets SIGHAN13, SIGHAN14, and SIGHAN15. Our model is superior to state-of-the-art baselines on SIGHAN13 and SIGHAN15 and maintains equal performance on SIGHAN14.", "track": "NLP Applications", "label": 0}, {"loc": [2.1115283966064453, 7.567514419555664], "id": 764, "title": "Extracting Trigger-sharing Events via an Event Matrix", "authors": "Jun Xu, Weidi Xu, Mengshu Sun, Taifeng Wang and Wei Chu", "abstract": "A growing interest emerges in event extraction which aims to extract multiple events with triggers and arguments. Previous methods mitigate the problem of multiple events extraction by predicting the arguments conditioned on the event trigger and event type, assuming that these arguments belong to a single event. However, the assumption is invalid in general as there may be multiple events. Therefore, we present a unified framework called MatEE for trigger-sharing events extraction. It resolves the kernel bottleneck by effectively modeling the relations between arguments by an event matrix, where trigger-sharing events are represented by multiple cliques. We verify the proposed method on 3 widely-used benchmark datasets of event extraction. The experimental results show that it beats all the advanced competitors, significantly improving the state-of-the-art performances in event extraction.", "track": "Information Extraction", "label": 5}, {"loc": [0.5011634230613708, 7.111895561218262], "id": 770, "title": "TranS: Transition-based Knowledge Graph Embedding with Synthetic Relation Representation", "authors": "Xuanyu Zhang, Qing Yang and Dongliang Xu", "abstract": "Knowledge graph embedding (KGE) aims to learn continuous vectors of relations and entities in knowledge graph (KG). Recently, transition-based KGE methods have become popular and achieved promising performance. However, scoring patterns like TransE are not suitable for complex scenarios where the same entity pair has different relations. Although some models attempt to employ entity-relation interaction or projection to improve entity representation for one-to-many/many-to-one/many-to-many complex relations, they still continue the TransE pattern, where only a single relation vector in the relation part is used to translate the head entity to the tail entity or their variants. And recent research shows that entity representation only needs to consider entities and their interactions to achieve better performance. Thus, in this paper, we propose a novel transition-based method, TranS, for KGE. The single relation vector of the relation part in traditional scoring patterns is replaced by synthetic relation representation with entity-relation interaction to solve these issues. And the entity part still retains its independence through entity-entity interactions. Experiments on a large KG dataset, ogbl-wikikg2, show that our model achieves state-of-the-art results.", "track": "Information Extraction", "label": 5}, {"loc": [3.960214376449585, 7.533139705657959], "id": 789, "title": "Sequential Topic Selection Model with Latent Variable for Topic-Grounded Dialogue", "authors": "Xiao-Fei Wen, Wei Wei and Xian-Ling Mao", "abstract": "Recently, topic-grounded dialogue system has attracted significant attention due to its effectiveness in predicting the next topic to yield better responses via the historical context and given topic sequence. However, almost all existing topic prediction solutions focus on only the current conversation and corresponding topic sequence to predict the next conversation topic, without exploiting other topic-guided conversations which may contain relevant topic-transitions to current conversation. To address the problem, in this paper we propose a novel approach, named Sequential Global Topic Attention (SGTA) to exploit topic transition over all conversations in a subtle way for better modeling post-to-response topic-transition and guiding the response generation to the current conversation. Specifically, we introduce a latent space modeled as a Multivariate Skew-Normal distribution with hybrid kernel functions to flexibly integrate the global-level information with sequence-level information, and predict the topic based on the distribution sampling results. We also leverage a topic-aware prior-posterior approach for secondary selection of predicted topics, which is utilized to optimize the response generation task. Extensive experiments demonstrate that our model outperforms competitive baselines on prediction and generation tasks.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [4.544554233551025, 7.478976249694824], "id": 791, "title": "Robust Task-Oriented Dialogue Generation with Contrastive Pre-training and Adversarial Filtering", "authors": "Shiquan Yang, Xinting Huang, Jey Han Lau and Sarah Erfani", "abstract": "Data artifacts incentivize machine learning models to learn non-transferable generalizations by taking advantage of shortcuts in the data, and\nthere is growing evidence that data artifacts play a role for the strong results that \ndeep learning models achieve in recent natural language processing benchmarks.\nIn this paper, we focus on task-oriented dialogue and investigate whether popular \ndatasets such as MultiWOZ contain such data artifacts.\nWe found that by only keeping frequent phrases in the training\nexamples, state-of-the-art models perform similarly compared \nto the variant trained with full data, suggesting \nthey exploit these spurious correlations\nto solve the task. Motivated by this, we propose \na contrastive learning based framework to encourage the model \nto ignore these cues and focus on learning generalisable patterns. We also experiment with adversarial filtering to remove easy training instances so that the model would focus on learning from the harder instances. We conduct a number of generalization\n experiments --- e.g., cross-domain/dataset and adversarial tests --- to assess the robustness of our approach and found that it works exceptionally well.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [1.3026673793792725, 4.905750274658203], "id": 806, "title": "STAR: SQL Guided Pre-Training for Context-dependent Text-to-SQL Parsing", "authors": "Zefeng Cai, Xiangyu Li, Binyuan Hui, Min Yang, Bowen Li, Binhua Li, Zheng Cao, weijie li, Fei Huang, Luo Si and Yongbin Li", "abstract": "In this paper, we propose a novel SQL guided pre-training framework STAR for context-dependent text-to-SQL parsing, which leverages contextual information to enrich natural language (NL) utterance and table schema representations for text-to-SQL conversations. Concretely, we propose two novel pre-training objectives which respectively explore the context-dependent interactions of NL utterances and SQL queries within each text-to-SQL conversation: (i) schema state tracking (SST) objective that tracks and explores the schema states of context-dependent SQL queries in the form of schema-states by predicting and updating the value of each schema slot during interaction; (ii) utterance dependency tracking (UDT) objective that employs weighted contrastive learning to pull together two semantically similar NL utterances and push away the representations of semantically dissimilar NL utterances within each conversation. In addition, we construct a high-quality large-scale context-dependent text-to-SQL conversation corpus to pre-train STAR. Extensive experiments show that STAR achieves new state-of-the-art performance on two downstream benchmarks (SParC and CoSQL), significantly outperforming previous pre-training methods and ranking first on the leaderboard. We believe the release of the constructed corpus, codebase and pre-trained STAR checkpoints would push forward the research in this area.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [4.223868370056152, 7.318996906280518], "id": 809, "title": "Is MultiWOZ a Solved Task? An Interactive TOD Evaluation Framework with User Simulator", "authors": "Qinyuan Cheng, Linyang Li, Guofeng Quan, Feng Gao, xiaofeng mou and Xipeng Qiu", "abstract": "Task-Oriented Dialogue (TOD) systems are drawing more and more attention in recent studies.\nCurrent methods focus on constructing pre-trained models or fine-tuning strategies while the evaluation of TOD is limited by a policy mismatch problem.\nThat is, during evaluation, the user utterances are from the annotated dataset while these utterances should interact with previous responses which can have many alternatives besides annotated texts.\nTherefore, in this work, we propose an interactive evaluation framework for TOD. \nWe first build a goal-oriented user simulator based on pre-trained models and then use the user simulator to interact with the dialogue system to generate dialogues.\nBesides, we introduce a sentence-level and a session-level score to measure the sentence fluency and session coherence in the interactive evaluation. \nExperimental results show that RL-based TOD systems trained by our proposed user simulator can achieve nearly 98\\% inform and success rates in the interactive evaluation of MultiWOZ dataset and the proposed scores measure the response quality besides the inform and success rates.\nWe are hoping that our work will encourage simulator-based interactive evaluations in the TOD task.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [10.369902610778809, 6.783526420593262], "id": 820, "title": "Translating Hanja Historical Documents to Contemporary Korean and English", "authors": "juhee son, Jiho Jin, Haneul Yoo, JinYeong Bak, Kyunghyun Cho and Alice Oh", "abstract": "The Annals of Joseon Dynasty (AJD) contain the daily records of the Kings of Joseon, the 500-year kingdom preceding the modern nation of Korea.\nThe Annals were originally written in an archaic Korean writing system, `Hanja', and were translated into Korean from 1968 to 1993.\nThe resulting translation was however too literal and contained many archaic Korean words; thus, a new expert translation effort began in 2012. Since then, the records of only one king have been completed in a decade.\nIn parallel, expert translators are working on English translation, also at a slow pace and produced only one king's records in English so far.\nThus, we propose H2KE, a neural machine translation model, that translates historical documents in Hanja to more easily understandable Korean and to English.\nBuilt on top of multilingual neural machine translation, H2KE learns to translate a historical document written in Hanja, from both a full dataset of outdated Korean translation and a small dataset of more recently translated contemporary Korean and English.\nWe compare our method against two baselines:\na recent model that simultaneously learns to restore and translate Hanja historical document\nand a Transformer based model trained only on newly translated corpora.\nThe experiments reveal that our method significantly outperforms the baselines in terms of BLEU scores for both contemporary Korean and English translations.\nWe further conduct extensive human evaluation which shows that our translation is preferred over the original expert translations by both experts and non-expert Korean speakers.", "track": "Machine Translation", "label": 10}, {"loc": [5.7967848777771, 11.88735294342041], "id": 835, "title": "Exploring Compositional Image Retrieval with Hybrid Compositional Learning and Heuristic Negative Mining", "authors": "Chao Wang, Ehsan Nezhadarya, Tanmana Sadhu and Shengdong Zhang", "abstract": "Compositional image retrieval (CIR) is a challenging retrieval task, where the query is composed of a reference image and a modification text, and the target is another image reflecting the modification to the reference image. Due to the great success of the pre-trained vision-and-language model CLIP and its favorable applicability to large-scale retrieval tasks, we propose a CIR model HyCoLe-HNM with CLIP as the backbone. In HyCoLe-HNM, we follow the contrastive pre-training method of CLIP to perform cross-modal representation learning. On this basis, we propose a hybrid compositional learning mechanism, which includes both image compositional learning and text compositional learning. In hybrid compositional learning, we borrow a gated fusion mechanism from a question answering model to perform compositional fusion, and propose a heuristic negative mining method to filter negative samples. Privileged information in the form of image-related texts is utilized in cross-modal representation learning and hybrid compositional learning. Experimental results show that HyCoLe-HNM achieves state-of-the-art performance on three CIR datasets, namely FashionIQ, Fashion200K, and MIT-States.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [8.43426513671875, 7.733405590057373], "id": 847, "title": "Outlier Dimensions that Disrupt Transformers are Driven by Frequency", "authors": "Giovanni Puccetti, Anna Rogers, Aleksandr Drozd and Felice Dell'Orletta", "abstract": "While Transformer-based language models are generally very robust to pruning, there is the recently discovered outlier phenomenon: disabling only 48 out of 110M parameters in BERT-base drops its performance by nearly 30% on MNLI. We replicate the original evidence for the outlier phenomenon and we link it to the geometry of the embedding space. We find that in both BERT and RoBERTa the magnitude of hidden state coefficients corresponding to outlier dimensions correlate with the frequencies of encoded tokens in pre-training data, and they also contribute to the \"vertical\" self-attention pattern enabling the model to focus on the special tokens. This explains the drop in performance from disabling the outliers, and it suggests that to decrease anisotopicity in future models we need pre-training schemas that would better take into account the skewed token distributions.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.095840930938721, 5.199019432067871], "id": 849, "title": "MiST: a Large-Scale Annotated Resource and Neural Models for Functions of Modal Verbs in English Scientific Text", "authors": "Sophie Henning, Nicole Macher, Stefan Gr\u00fcnewald and Annemarie Friedrich", "abstract": "Modal verbs (e.g., can, should or must) occur highly frequently in scientific articles. Decoding their function is not straightforward: they are often used for hedging, but they may also denote abilities and restrictions. Understanding their meaning is important for accurate information extraction from scientific text.\n\nTo foster research on the usage of modals in this genre, we introduce the MIST (Modals In Scientific Text) dataset, which contains 3737 modal instances in five scientific domains annotated for their semantic, pragmatic, or rhetorical function. We systematically evaluate a set of competitive neural architectures on MIST. Transfer experiments reveal that leveraging non-scientific data is of limited benefit for modeling the distinctions in MIST. Our corpus analysis provides evidence that scientific communities differ in their usage of modal verbs, yet, classifiers trained on scientific data generalize to some extent to unseen scientific domains.", "track": "Ethic Concerns:Discourse and Pragmatics", "label": 24}, {"loc": [8.27032470703125, 8.741168022155762], "id": 853, "title": "Late Prompt Tuning: A Late Prompt Could Be Better Than Many Prompts", "authors": "Xiangyang Liu, Tianxiang Sun, Xuanjing Huang and Xipeng Qiu", "abstract": "Prompt tuning is a parameter-efficient tuning (PETuning) method for utilizing pre-trained models (PTMs) that simply prepends a soft prompt to the input and only optimizes the prompt to adapt PTMs to downstream tasks. Although it is parameter- and deployment-efficient, its performance still lags behind other state-of-the-art PETuning methods. Besides, the training cost of prompt tuning is not significantly reduced due to the back-propagation through the entire model. Through empirical analyses, we shed some light on the lagging performance of prompt tuning and recognize a trade-off between the propagation distance from label signals to the inserted prompt and the influence of the prompt on model outputs. Further, we present Late Prompt Tuning (LPT) that inserts a late prompt into an intermediate layer of the PTM instead of the input layer or all layers. The late prompt is obtained by a neural prompt generator conditioned on the hidden states before the prompt insertion layer and therefore is instance-dependent. Through extensive experimental results across various tasks and PTMs, we show that LPT can achieve competitive performance to full model tuning and other PETuning methods under both full-data and few-shot scenarios while possessing faster training speed and lower memory cost.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [4.79926061630249, 3.299973249435425], "id": 855, "title": "MICO: A Multi-alternative Contrastive Learning Framework for Commonsense Knowledge Representation", "authors": "Ying Su, Zihao Wang, Tianqing Fang, Hongming Zhang, Yangqiu Song and Tong Zhang", "abstract": "Commonsense reasoning tasks such as commonsense knowledge graph completion and commonsense question answering require powerful representation learning. In this paper, we propose to learn commonsense knowledge representation by MICO, a Multi-alternative contrastIve learning framework on COmmonsense knowledge graphs (MICO). MICO generates the commonsense knowledge representation by contextual interaction between entity nodes and relations with multi-alternative contrastive learning. \nIn MICO, the head and tail entities in an $(h,r,t)$ knowledge triple are converted to two relation-aware sequence pairs (a premise and an alternative) in the form of natural language. Semantic representations generated by MICO can benefit the following two tasks by simply comparing the similarity score between the representations: 1) zero-shot commonsense question answering tasks; 2) inductive commonsense knowledge graph completion tasks. Extensive experiments show the effectiveness of our method.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [0.9745473861694336, 10.470931053161621], "id": 856, "title": "Leveraging Only the Category Name for Aspect Detection through Prompt-based Constrained Clustering", "authors": "Yazheng Li, Pengyun Wang, Yasheng Wang, Yong Dai, Yadao Wang, Lujia Pan and Zenglin Xu", "abstract": "Aspect category detection (ACD) aims to automatically identify user-concerned aspects from online reviews, which is of great value for evaluating the fine-grained performance of a product. The most recent solutions tackle this problem via weakly supervised methods, achieving remarkable improvement over unsupervised methods. However, a closer look at these methods reveals that the required human efforts are nontrivial and can sometimes be hard to obtain. In this study, we explore the possibility of minimizing human guidance while improving detection performance, with a deep clustering method that relies merely on the category name of each aspect and a pretrained language model (LM). The LM, combined with prompt techniques, is employed as a knowledge base to automatically generate constraints for clustering, as well as to provide a representation space to perform the clustering. Our method (1) extracts extensive keywords to expand our understanding of each aspect, (2) automatically generates instance-level and concept-level constraints for clustering, and (3) trains the clustering model with the above constraints. We demonstrate the capability of the proposed framework through extensive experiments on nine benchmark datasets. Our model not only performs noticeably better than existing unsupervised approaches but also considerably surpasses weakly supervised methods that require more human efforts.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [4.131096839904785, 7.070195198059082], "id": 858, "title": "Controllable Factuality in Document-Grounded Dialog Systems Using a Noisy Channel Model", "authors": "Nico Daheim, David Thulke, Christian Dugast and Hermann Ney", "abstract": "In this work, we present a model for document-grounded response generation in dialog that is decomposed into two components according to Bayes' theorem.\nOne component is a traditional ungrounded response generation model and the other component models the reconstruction of the grounding document based on the dialog context and generated response.\nWe propose different approximate decoding schemes and evaluate our approach on multiple open-domain and task-oriented document-grounded dialog datasets.\nOur experiments show that the model is more factual in terms of automatic factuality metrics than the baseline model.\nFurthermore, we outline how introducing scaling factors between the components allows for controlling the tradeoff between factuality and fluency in the model output.\nFinally, we compare our approach to a recently proposed method to control factuality in grounded dialog, CTRL (Rashkin et al., 2021), and show that both approaches can be combined to achieve additional improvements.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [8.491815567016602, 7.826846122741699], "id": 868, "title": "Transformer Language Models without Positional Encodings Still Learn Positional Information", "authors": "Adi Haviv, Ori Ram, Ofir Press, Peter Izsak and Omer Levy", "abstract": "Causal transformer language models (LMs), such as GPT-3, typically require some form of positional encoding, such as positional embeddings. \nHowever, we show that LMs without any explicit positional encoding are still competitive with standard models and that this phenomenon is robust across different datasets, model sizes, and sequence lengths.\nProbing experiments reveal that such models acquire an implicit notion of absolute positions throughout the network, effectively compensating for the missing information.\nWe conjecture that causal attention enables the model to infer the number of predecessors that each token can attend to, thereby approximating its absolute position.\nOur findings indicate that causal LMs might derive positional awareness not only from the explicit positioning mechanism but also from the effects of the causal mask.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [5.905762195587158, 5.071253299713135], "id": 870, "title": "Beyond Model Interpretability: On the Faithfulness and Adversarial Robustness of Contrastive Textual Explanations", "authors": "Julia El Zini and Mariette Awad", "abstract": "Contrastive explanation methods go beyond transparency and address the contrastive aspect of explanations. Such explanations are emerging as an attractive option to provide actionable change to scenarios adversely impacted by classifiers' decisions. However, their extension to textual data is under-explored and there is little investigation on their vulnerabilities and limitations. \n\nThis work motivates textual counterfactuals by highlighting the social limitations of non-contrastive explainability. We also lay the ground for a novel evaluation scheme inspired by the faithfulness of explanations. Accordingly, we extend the computation of three metrics, proximity, connectedness and stability, to textual data and we benchmark two successful contrastive methods, POLYJUICE and MiCE, on our suggested metrics. Experiments on sentiment analysis data show that the connectedness of counterfactuals to their original counterparts is not obvious in both models. More interestingly, the generated contrastive texts are more attainable with POLYJUICE which highlights the significance of latent representations in counterfactual search. Finally, we perform the first semantic adversarial attack on textual recourse methods. The results demonstrate the robustness of POLYJUICE and the role that latent input representations play in robustness and reliability.", "track": "Ethics", "label": 21}, {"loc": [8.539549827575684, 7.848979949951172], "id": 871, "title": "How Much Does Attention Actually Attend? Questioning the Importance of Attention in Pretrained Transformers", "authors": "Michael Hassid, Hao Peng, Daniel Rotem, Jungo Kasai, Ivan Montero, Noah A. Smith and Roy Schwartz", "abstract": "The attention mechanism is considered the backbone of the widely-used Transformer architecture. It contextualizes the input by computing input-specific attention matrices. We find that this mechanism, while powerful and elegant, is not as important as typically thought for pretrained language models. We introduce PAPA, a new probing method that replaces the input-dependent attention matrices with constant ones---the average attention weights over multiple inputs. We use PAPA to analyze several established pretrained Transformers on six downstream tasks. We find that without any input-dependent attention, all models achieve competitive performance---an average relative drop of only 8% from the probing baseline. Further, little or no performance drop is observed when replacing half of the input-dependent attention matrices with constant (input-independent) ones. Interestingly, we show that better-performing models lose more from applying our method than weaker models, suggesting that the utilization of the input-dependent attention mechanism might be a factor in their success. Our results motivate research on simpler alternatives to input-dependent attention, as well as on methods for better utilization of this mechanism in the Transformer architecture.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [7.299374103546143, 7.5945210456848145], "id": 874, "title": "What Has Been Enhanced in my Knowledge-Enhanced Language Model?", "authors": "Yifan Hou, Guoji Fu and Mrinmaya Sachan", "abstract": "A number of knowledge integration (KI) methods have recently been proposed to incorporate external knowledge into pretrained language models (LMs). Even though knowledge-enhanced LMs (KELMs) outperform base LMs on knowledge-intensive tasks, the inner-workings of these KI methods are not well-understood. For instance, it is unclear which knowledge is effectively integrated into KELMs and which is not; and if such integration led to catastrophic forgetting of already learned knowledge. We show that existing model interpretation methods such as linear probes and prompts have some key limitations in answering these questions. Then, we revisit KI from an information-theoretic view and propose a new theoretically sound probe model called Graph Convolution Simulator (GCS) for KI interpretation. GCS is eventually quite simple -- it uses graph attention on the corresponding knowledge graph for interpretation.\nWe conduct various experiments to verify that GCS provides reasonable interpretation results for two well-known KELMs: ERNIE and K-Adapter. Our experiments reveal that only little knowledge is successfully integrated in these models, and simply increasing the size of the KI corpus may not lead to better KELMs.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [1.2823032140731812, 7.9568328857421875], "id": 878, "title": "Towards Generalized Open Information Extraction", "authors": "Bowen Yu", "abstract": "Open Information Extraction (OpenIE) facilitates the open-domain discovery of textual facts. However, the prevailing solutions evaluate OpenIE models on in-domain test sets aside from the training corpus, which certainly violates the initial task principle of domain-independence. In this paper, we propose to advance OpenIE towards a more realistic scenario: generalizing over unseen target domains with different data distributions from the source training domains, termed Generalized OpenIE. For this purpose, we first introduce GLOBE, a large-scale human-annotated multi-domain OpenIE benchmark, to examine the robustness of recent OpenIE models to domain shifts, and the relative performance degradation of up to 70\\% implies the challenges of generalized OpenIE. Then, we propose DragonIE, which explores a minimalist expression of textual fact: directed acyclic graph, to improve the OpenIE generalization ability. Extensive experiments demonstrate that DragonIE beats the previous methods in both in-domain and out-of-domain settings by as much as 6.0% in F1 score absolutely, but there is still ample room for improvement.", "track": "Information Extraction", "label": 5}, {"loc": [2.390735149383545, 8.668560028076172], "id": 880, "title": "BioLORD: Learning Ontological Representations from Definitions for Biomedical Concepts and their Textual Descriptions", "authors": "Fran\u00e7ois Remy, Kris Demuynck and Thomas Demeester", "abstract": "This work introduces BioLORD, a new pre-training strategy for producing meaningful representations for clinical sentences and biomedical concepts. State-of-the-art methodologies operate by maximizing the similarity in representation of names referring to the same concept, and preventing collapse through contrastive learning. However, because biomedical names are not always self-explanatory, it sometimes results in non-semantic representations. BioLORD overcomes this issue by grounding its concept representations using definitions, as well as short descriptions derived from a multi-relational knowledge graph consisting of biomedical ontologies. Thanks to this grounding, our model produces more semantic concept representations that match more closely the hierarchical structure of ontologies. BioLORD establishes a new state of the art for text similarity on both clinical sentences (MedSTS) and biomedical concepts (MayoSRS).", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [8.285796165466309, 6.570565700531006], "id": 881, "title": "Improving the Extraction of Supertags for Constituency Parsing with Linear Context-Free Rewriting Systems", "authors": "Thomas Ruprecht", "abstract": "In parsing phrase structures, supertagging achieves a symbiosis between the interpretability of formal grammars and the accuracy and speed of more recent neural models.\nThe approach was only recently transferred to parsing discontinuous constituency structures with linear context-free rewriting systems (LCFRS).\nWe reformulate and parameterize the previously fixed extraction process for LCFRS supertags with the aim to improve the overall parsing quality.\nThese parameters are set in the context of several steps in the extraction process and are used to control the granularity of extracted grammar rules as well as the association of lexical symbols with each supertag.\nWe evaluate the influence of the parameters on the sets of extracted supertags and the parsing quality using three treebanks in the English and German language, and we compare the best-performing configurations to recent state-of-the-art parsers in the area.\nOur results show that some of our configurations and the slightly modified parsing process improve the quality and speed of parsing with our supertags over the previous approach.\nMoreover, we achieve parsing scores that either surpass or are among the state-of-the-art in discontinuous constituent parsing.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [8.090771675109863, 7.550258159637451], "id": 892, "title": "Mask More and Mask Later: Efficient Pre-training of Masked Language Models by Disentangling the [MASK] Token", "authors": "Baohao Liao, David Thulke, Sanjika Hewavitharana, Hermann Ney and Christof C. Monz", "abstract": "The pre-training of masked language models (MLMs) consumes massive computation to achieve good results on downstream NLP tasks, resulting in a large carbon footprint. In the vanilla MLM, the virtual tokens, [MASK]s, act as placeholders and gather the contextualized information from unmasked tokens to restore the corrupted information. It raises the question of whether we can append [MASK]s at a later layer, to reduce the sequence length for earlier layers and make the pre-training more efficient. We show: (1) [MASK]s can indeed be appended at a later layer, being disentangled from the word embedding; (2) The gathering of contextualized information from unmasked tokens can be conducted with a few layers. By further increasing the masking rate from 15\\% to 50\\%, we can pre-train RoBERTa-base and RoBERTa-large from scratch with only 78\\% and 68\\% of the original computational budget without any degradation on the GLUE benchmark. When pre-training with the original budget, our method outperforms RoBERTa for 6 out of 8 GLUE tasks, on average by 0.4\\%.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [7.414797306060791, 5.805326461791992], "id": 907, "title": "SMSMix: Sense-Maintained Sentence Mixup for Word Sense Disambiguation", "authors": "Hee Suk Yoon, Eunseop Yoon, John Harvill, Sunjae Yoon, Mark Hasegawa-Johnson and Chang D. Yoo", "abstract": "Word Sense Disambiguation (WSD) is an NLP task aimed at determining the correct sense of a word in a sentence from discrete sense choices. Although current systems have attained unprecedented performances for such tasks, the nonuniform distribution of word senses during training generally results in systems performing poorly on rare senses. To this end, we consider data augmentation to increase the frequency of these least frequent senses (LFS) to reduce the distributional bias of senses during training. We propose Sense-Maintained Sentence Mixup (SMSMix), a novel word-level mixup method that maintains the sense of a target word. SMSMix smoothly blends two sentences using mask prediction while preserving the relevant span determined by saliency scores to maintain a specific word's sense. To the best of our knowledge, this is the first attempt to apply mixup in NLP while preserving the meaning of a specific word. With extensive experiments, we validate that our augmentation method can effectively give more information about rare senses during training with maintained target sense label.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [5.544996738433838, 8.510421752929688], "id": 934, "title": "On the Effectiveness of Automated Metrics for Text Generation Systems", "authors": "Pius von D\u00e4niken, Jan Deriu, Don Tuggener and Mark Cieliebak", "abstract": "A major challenge in the field of Text Generation is evaluation, because we lack a sound theory that can be leveraged to extract guidelines for evaluation campaigns. In this work, we propose a first step towards such a theory that incorporates different sources of uncertainty, such as imperfect automated metrics and insufficiently sized test sets. The theory has practical applications, such as determining the number of samples needed to reliably distinguish the performance of a set of Text Generation systems in a given setting. \nWe showcase the application of the theory on the WMT 21 and Spot-The-Bot evaluation data and outline how it can be leveraged to improve the evaluation protocol regarding the reliability, robustness, and significance of the evaluation outcome.", "track": "Natural Language Generation", "label": 6}, {"loc": [6.1626482009887695, 8.694008827209473], "id": 941, "title": "Residual Learning of Neural Text Generation with n-gram Language Model", "authors": "Huayang Li, Deng Cai, Jin Xu and Taro Watanabe", "abstract": "N-gram language models (LM) has been largely superseded by neural LMs as the latter exhibits better performance. However, we find that n-gram models can achieve satisfactory performance on a large proportion of testing cases, indicating they have already captured abundant knowledge of the language with relatively low computational cost. With this observation, we propose to learn a neural LM that fits the residual between an n-gram LM and the real-data distribution. The combination of n-gram LMs and neural LMs not only allows the neural part to focus on deeper understanding of the language, but also provides a flexible way to customize a LM by switching the underlying n-gram model without changing the neural model. Experimental results on three typical language tasks (i.e., language modeling, machine translation, and summarization) demonstrate that our approach attains additional performance gains over popular standalone neural models consistently. We also show that our approach allows for effective domain adaptation by simply switching to a domain-specific n-gram model, without any extra training.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.612253189086914, 9.553153038024902], "id": 943, "title": "DiffG-RL: Leveraging Difference between Environment State and Common Sense", "authors": "Tsunehiko Tanaka, Daiki Kimura and Michiaki Tatsubori", "abstract": "Taking into account background knowledge as the context has always been an important part of solving tasks that involve natural language. One representative example of such tasks is text-based games, where players need to make decisions based on both description text previously shown in the game, and their own background knowledge about the language and common sense. In this work, we investigate not simply giving common sense, as can be seen in prior research, but also its effective usage. We assume that a part of the environment states different from common sense should constitute one of the grounds for action selection. We propose a novel agent, DiffG-RL, which constructs a Difference Graph that organizes the environment states and common sense by means of interactive objects with a dedicated graph encoder. DiffG-RL also contains a framework for extracting the appropriate amount and representation of common sense from the source to support the construction of the graph. We validate DiffG-RL in experiments with text-based games that require common sense and show that it outperforms baselines by 17% of scores. We will make our code publicly available.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [8.150410652160645, 5.2382001876831055], "id": 949, "title": "Unsupervised Syntactically Controlled Paraphrase Generation with Abstract Meaning Representations", "authors": "Kuan-Hao Huang, Varun Iyer, Anoop Kumar, Sriram Venkatapathy, Kai-Wei Chang and Aram Galstyan", "abstract": "Syntactically controlled paraphrase generation has become an emerging research direction in recent years. Most existing approaches require annotated paraphrase pairs for training and are thus costly to extend to new domains. Unsupervised approaches, on the other hand, do not need paraphrase pairs but suffer from relatively poor performance in terms of syntactic control and quality of generated paraphrases. In this paper, we demonstrate that leveraging Abstract Meaning Representations (AMR) can greatly improve the performance of unsupervised syntactically controlled paraphrase generation.\nOur proposed model, AMR-enhanced Paraphrase Generator (AMRPG), separately encodes the AMR graph and the constituency parse of the input sentence into two disentangled semantic and syntactic embeddings. A decoder is then learned to reconstruct the input sentence from the semantic and syntactic embeddings. Our experiments show that AMRPG generates more accurate syntactically controlled paraphrases, both quantitatively and qualitatively, compared to the existing unsupervised approaches. We also demonstrate that the paraphrases generated by AMRPG can be used for data augmentation to improve the robustness of NLP models.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [4.602099895477295, 4.517934322357178], "id": 956, "title": "Can AMR Assist Legal and Logical Reasoning?", "authors": "Nikolaus Schrack, Ruixiang Cui, Hugo A. L\u00f3pez and Daniel Hershcovich", "abstract": "Abstract Meaning Representation (AMR) has been shown to be useful for many downstream tasks. In this work, we explore the use of AMR for legal and logical reasoning. Specifically, we investigate if AMR can help capture logical relationships on multiple choice question answering (MCQA) tasks. We propose neural architectures that utilize linearised AMR graphs in combination with pre-trained language models. While these models are not able to outperform text-only baselines, they correctly solve different instances than the text models, suggesting complementary abilities. Error analysis further reveals that AMR parsing quality is the most prominent challenge, especially regarding inputs with multiple sentences. We conduct a theoretical analysis of how logical relations are represented in AMR and conclude it might be helpful in some logical statements but not for others.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [10.569663047790527, 6.962979793548584], "id": 961, "title": "Data Selection Curriculum for Neural Machine Translation", "authors": "Tasnim Mohiuddin, Philipp Koehn, Vishrav Chaudhary, James Cross, Shruti Bhosale and Shafiq Joty", "abstract": "Neural Machine Translation (NMT) models are typically trained on heterogeneous data that are concatenated and randomly shuffled. However, not all of the training data are equally useful to the model. Curriculum training aims to present the data to the NMT models in a meaningful order. In this work, we introduce a two-stage training framework for NMT where we fine-tune a base NMT model on subsets of data, selected by both deterministic scoring using pre-trained methods and online scoring that considers prediction scores of the emerging NMT model. Through comprehensive experiments on six language pairs comprising low- and high-resource languages from WMT'21, we have shown that our curriculum strategies consistently demonstrate better quality (up to +2.2 BLEU improvement) and faster convergence (approximately 50% fewer updates).", "track": "Machine Translation", "label": 10}, {"loc": [5.836450099945068, 9.042604446411133], "id": 973, "title": "Text Editing as Imitation Game", "authors": "Ning Shi, Bin Tang, Bo Yuan, Longtao Huang, Yewen Pu, Jie Fu and Zhouhan Lin", "abstract": "Text editing, such as grammatical error correction, arises naturally from imperfect textual data. Recent works frame text editing as a multi-round sequence tagging task, where operations -- such as insertion and substitution -- are represented as a sequence of tags. While achieving good results, this encoding is limited in flexibility as all actions are bound to token-level tags. In this work, we reformulate text editing as an imitation game using behavioral cloning. Specifically, we convert conventional sequence-to-sequence data into state-to-action demonstrations, where the action space can be as flexible as needed. Instead of generating the actions one at a time, we introduce a dual decoders structure to parallel the decoding while retaining the dependencies between action tokens, coupled with trajectory augmentation to alleviate the distribution shift that imitation learning often suffers. In experiments on a suite of Arithmetic Equation benchmarks, our model consistently outperforms the autoregressive baselines in terms of performance, efficiency, and robustness. We hope our findings will shed light on future studies in reinforcement learning applying sequence-level action generation to natural language processing.", "track": "Natural Language Generation", "label": 6}, {"loc": [6.962378025054932, 9.911242485046387], "id": 983, "title": "Seeded Hierarchical Clustering for Expert-Crafted Taxonomies", "authors": "Anish Saha, Amith Ananthram, Emily Allaway, Heng Ji and Kathleen McKeown", "abstract": "Practitioners from many disciplines (e.g., political science) use expert-crafted taxonomies to make sense of large, unlabeled corpora. In this work, we study Seeded Hierarchical Clustering (SHC): the task of automatically fitting unlabeled data to such taxonomies using a small set of labeled examples. We propose HierSeed, a novel weakly supervised algorithm for this task that uses only a small set of labeled seed examples in a computation and data efficient manner. HierSeed assigns documents to topics by weighing document density against topic hierarchical structure. It outperforms unsupervised and supervised baselines for the SHC task on three real-world datasets.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [0.697782576084137, 6.815269947052002], "id": 988, "title": "Knowledge Graph Generation From Text", "authors": "Igor Melnyk, Pierre Dognin and Payel Das", "abstract": "In this work we propose a novel end-to-end multi-stage Knowledge Graph (KG) generation system from textual inputs, separating the overall process into two stages. The graph nodes are generated first using pretrained language model, followed by a simple edge construction head, enabling efficient KG extraction from the text. For each stage we consider several architectural choices that can be used depending on the available training resources. We evaluated the model on a recent WebNLG 2020 Challenge dataset, matching the state-of-the-art performance on text-to-RDF generation task, as well as on New York Times (NYT) and a large-scale TekGen datasets, showing strong overall performance, outperforming the existing baselines. We believe that the proposed system can serve as a viable KG construction alternative to the existing linearization or sampling-based graph generation approaches.", "track": "NLP Applications", "label": 0}, {"loc": [3.8624353408813477, 7.540047645568848], "id": 995, "title": "DialogueGAT: A Graph Attention Network for Financial Risk Prediction by Modeling the Dialogues in Earnings Conference Calls", "authors": "Yunxin Sang and Yang Bao", "abstract": "Financial risk prediction is an essential task for risk management in capital markets. While traditional prediction models are built based on the hard information of numerical data, recent studies have shown that the soft information of verbal cues in earnings conference calls is significant for predicting market risk due to its less constrained fashion and direct interaction between managers and analysts. However, most existing models mainly focus on extracting useful semantic information from the textual conference call transcripts but ignore their subtle yet important information of dialogue structures. To bridge this gap, we develop a graph attention network called DialogueGAT for financial risk prediction by simultaneously modeling the speakers and their utterances in dialogues in conference calls. Different from previous studies, we propose a new method for constructing the graph of speakers and utterances in a dialogue, and design contextual attention at both speaker and utterance levels for disentangling their effects on the downstream prediction task. For model evaluation, we extend an existing dataset of conference call transcripts by adding the dialogue structure and speaker information. Empirical results on our dataset of S&P1500 companies demonstrate the superiority of our proposed model over competitive baselines from the extant literature.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [6.338695526123047, 5.31313943862915], "id": 996, "title": "Investigating Ensemble Methods for Model Robustness Improvement of Text Classifiers", "authors": "Jieyu Zhao, Xuezhi Wang, Yao Qin, Jilin Chen and Kai-Wei Chang", "abstract": "Large pre-trained language models have shown remarkable performance over the past few years. These models, however, sometimes learn superficial features from the dataset and cannot generalize to the distributions that are dissimilar to the training scenario. There have been several approaches proposed to reduce model's reliance on these bias features which can improve model robustness in the out-of-distribution setting. However, existing methods usually use a fixed low-capacity model to deal with various bias features, which ignore the learnability of those features. In this paper, we analyze a set of existing bias features and demonstrate there is no single model that works best for all the cases. We further show that by choosing an appropriate bias model, we can obtain a better robustness result than baselines with a more sophisticated model design.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [7.26923942565918, 9.541299819946289], "id": 1007, "title": "Adaptive Ranking-based Sample Selection for Weakly Supervised Class-imbalanced Text Classification", "authors": "Linxin Song, Jieyu Zhang, Tianxiang Yang and Masayuki Goto", "abstract": "To obtain a large amount of training labels inexpensively, researchers have recently adopted the weak supervision (WS) paradigm, which leverages labeling rules to synthesize training labels rather than using individual annotations to achieve competitive results for natural language processing (NLP) tasks. However, data imbalance is often overlooked in applying the WS paradigm, despite being a common issue in a variety of NLP tasks. To address this challenge, we propose Adaptive Ranking-based Sample Selection (ARS2), a model-agnostic framework to alleviate the data imbalance issue in the WS paradigm. Specifically, it calculates a probabilistic margin score based on the output of the current model to measure and rank the cleanliness of each data point. Then, the ranked data are sampled based on both class-wise and rule-aware ranking. In particular, the two sample strategies corresponds to our motivations: (1) to train the model with balanced data batches to reduce the data imbalance issue and (2) to exploit the expertise of each labeling rule for collecting clean samples. Experiments on four text classification datasets with four different imbalance ratios show that ARS2 outperformed the state-of-the-art imbalanced learning and WS methods, leading to a 2%-57.8% improvement on their F1-score.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [4.808779239654541, 3.34919810295105], "id": 1011, "title": "ComFact: A Benchmark for Linking Contextual Commonsense Knowledge", "authors": "Silin Gao, Jena D. Hwang, Saya Kanno, Hiromi Wakaki, Yuki Mitsufuji and Antoine Bosselut", "abstract": "Understanding rich narratives, such as dialogues and stories, often requires natural language processing systems to access relevant knowledge from commonsense knowledge graphs. However, these systems typically retrieve facts from KGs using simple heuristics that disregard the complex challenges of identifying situationally-relevant commonsense knowledge (e.g., contextualization, implicitness, ambiguity).\n\nIn this work, we propose the new task of commonsense fact linking, where models are given contexts and trained to identify situationally-relevant commonsense knowledge from KGs. Our novel benchmark, ComFact, contains ~293k in-context relevance annotations for commonsense triplets across four stylistically diverse dialogue and storytelling datasets. Experimental results confirm that heuristic fact linking approaches are imprecise knowledge extractors. Learned fact linking models demonstrate across-the-board performance improvements (~34.6% F1) over these heuristics. Furthermore, improved knowledge retrieval yielded average downstream improvements of 9.8% for a dialogue response generation task. However, fact linking models still significantly underperform humans, suggesting our benchmark is a promising testbed for research in commonsense augmentation of NLP systems.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [7.587749004364014, 7.9540181159973145], "id": 1014, "title": "Learning to Perform Complex Tasks through Compositional Fine-Tuning of Language Models", "authors": "Victor Bursztyn, David Demeter, Doug Downey and Larry Birnbaum", "abstract": "How to usefully encode compositional task structure has long been a core challenge in AI. Recent work in chain of thought prompting has shown that for very large neural language models (LMs), explicitly demonstrating the inferential steps involved in a target task may improve performance over end-to-end learning that focuses on the target task alone. However, chain of thought prompting has significant limitations due to its dependency on huge pretrained LMs. In this work, we present compositional fine-tuning (CFT): an approach based on explicitly decomposing a target task into component tasks, and then fine-tuning smaller LMs on a curriculum of such component tasks. We apply CFT to recommendation tasks in two domains, world travel and local dining, as well as a previously studied inferential task (sports understanding). We show that CFT outperforms end-to-end learning even with equal amounts of data, and gets consistently better as more component tasks are modeled via fine-tuning. Compared with chain of thought prompting, CFT performs at least as well using LMs only 7.4% of the size, and is moreover applicable to task domains for which data are not available during pretraining.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [6.882952690124512, 9.921767234802246], "id": 1027, "title": "Topic Taxonomy Expansion via Hierarchy-Aware Topic Phrase Generation", "authors": "Dongha Lee, Jiaming Shen, Seonghyeon Lee, Susik Yoon, Hwanjo Yu and Jiawei Han", "abstract": "Topic taxonomies display hierarchical topic structures of a text corpus and provide topical knowledge to enhance various NLP applications. To dynamically incorporate new topic information, several recent studies have tried to expand (or complete) a topic taxonomy by inserting emerging topics identified in a set of new documents. However, existing methods focus only on frequent terms in documents and the local topic-subtopic relations in a taxonomy, which leads to limited topic term coverage and fails to model the global taxonomy structure. In this work, we propose a novel framework for topic taxonomy expansion, named TopicExpan, which directly generates topic-related terms belonging to new topics. Specifically, TopicExpan leverages the hierarchical relation structure surrounding a new topic and the textual content of an input document for topic term generation. This approach encourages newly-inserted topics to further cover important but less frequent terms as well as to keep their relation consistency within the taxonomy. Experimental results on two real-world text corpora show that TopicExpan significantly outperforms other baseline methods in terms of the quality of output taxonomies.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [5.934807777404785, 6.070494174957275], "id": 1029, "title": "Language as a fingerprint: Self-supervised learning of user encodings using transformers", "authors": "Roberta Rocca and Tal Yarkoni", "abstract": "The way we talk carries information about who we are. Demographics, personality, clinical conditions, political preferences influence what we speak about and how, suggesting that many individual attributes could be inferred from adequate encodings of linguistic behavior. Conversely, conditioning text representations on author attributes has been shown to improve model performance in many NLP tasks. Previous research on individual differences and language representations has mainly focused on predicting selected attributes from text, or on conditioning text representations on such attributes for author-based contextualization. Here, we present a self-supervised approach to learning language-based user encodings using transformers. Using a large corpus of Reddit submissions, we fine-tune DistilBERT on user-based triplet loss. We show that fine-tuned models can pick up on complex linguistic signatures of users, and that they are able to infer rich information about them. Through a series of intrinsic analyses and probing tasks, we provide evidence that fine-tuning enhances models' ability to abstract generalizable user information, which yields performance advantages for user-based downstream tasks. We discuss applications in language-based assessment and contextualized and personalized NLP.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [7.51607608795166, 8.335508346557617], "id": 1032, "title": "Hyperdecoders: Instance-specific decoders for multi-task NLP", "authors": "Hamish Ivison and Matthew Peters", "abstract": "We investigate input-conditioned hypernetworks for multi-tasking in NLP, generating parameter-efficient adaptations for a decoder using a hypernetwork conditioned on the output of an encoder. This approach produces a unique decoder adaptation for every input instance, allowing the network a larger degree of flexibility than prior work that only produces one decoder adaptation per task. We apply our method to sequence classification tasks, extractive QA, and summarisation and find that it surpasses previous parameter efficient fine-tuning methods and often outperforms fully finetuning the underlying model. An analysis of the embeddings used by our hypernetwork shows that they are sensitive to output label and type, suggesting that our approach better maps from encoder representations to output labels. Our code is publicly available at https://github.com/allenai/hyperdecoders.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [5.489401817321777, 5.087064266204834], "id": 1038, "title": "Evaluating the Faithfulness of Importance Measures in NLP by Recursively Masking Allegedly Important Tokens and Retraining", "authors": "Andreas Madsen, Nicholas Meade, Vaibhav Adlakha and Siva Reddy", "abstract": "To explain NLP models a popular approach is to use importance measures, such as attention, which inform input tokens are important for making a prediction. However, an open question is how well these explanations accurately reflect a model's logic, a property called faithfulness. To answer this question, we propose Recursive ROAR, a new faithfulness metric. This works by recursively masking allegedly important tokens and then retraining the model. The principle is that this should result in worse model performance compared to masking random tokens. The result is a performance curve given a masking-ratio. Furthermore, we propose a summarizing metric using area-between-curves (ABC), which allows for easy comparison across papers, models, and tasks. We evaluate 4 different importance measures on 8 different datasets, using both LSTM-attention models and RoBERTa models. We find that the faithfulness of importance measures is both model-dependent and task-dependent. This conclusion contradicts previous evaluations in both computer vision and faithfulness of attention literature.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.697113513946533, 6.019443035125732], "id": 1042, "title": "Towards Explaining Subjective Ground of Individuals on Social Media", "authors": "Younghun Lee and Dan Goldwasser", "abstract": "Large-scale language models have been reducing the gap between machines and humans in understanding the real world, yet understanding an individual's theory of mind and behavior from text is far from being resolved. \n\nThis research proposes a neural model---Subjective Ground Attention---that learns subjective grounds of individuals and accounts for their judgments on situations of others posted on social media. Using simple attention modules as well as taking one's previous activities into consideration, we empirically show that our model provides human-readable explanations of an individual's subjective preference in judging social situations. We further qualitatively evaluate the explanations generated by the model and claim that our model learns an individual's subjective orientation towards abstract moral concepts.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [2.3674139976501465, 8.600701332092285], "id": 1045, "title": "Knowledge Injected Prompt Based Fine-tuning for Multi-label Few-shot ICD Coding", "authors": "Zhichao Yang, Shufan Wang, Bhanu Pratap Singh Rawat, Avijit Mitra and hong yu", "abstract": "Automatic International Classification of Diseases (ICD) coding aims to assign multiple ICD codes to a medical note with average length of 3,000+ tokens. This task is challenging due to a high-dimensional space of multi-label assignment (tens of thousands of ICD codes) and the long-tail challenge: only a few codes (common diseases) are frequently assigned while most codes (rare diseases) are infrequently assigned. This study addresses the long-tail challenge by adapting a prompt-based fine-tuning technique with label semantics, which has been shown to be effective under few-shot setting. To further enhance the performance in medical domain, we propose a knowledge-enhanced longformer by injecting three domain-specific knowledge: hierarchy, synonym, and abbreviation with additional pretraining using contrastive learning. Experiments on MIMIC-III-full, a benchmark dataset of code assignment, show that our proposed method outperforms previous state-of-the-art method in 14.5% in marco F1 (from 10.3 to 11.8, P<0.001). To further test our model on few-shot setting, we created a new rare diseases coding dataset, MIMIC-III-rare50, on which our model improves marco F1 from 17.1 to 30.4 and micro F1 from 17.2 to 32.6 compared to previous method.", "track": "NLP Applications", "label": 0}, {"loc": [4.294095993041992, 3.9713070392608643], "id": 1051, "title": "Do Language Models Understand Measurements?", "authors": "Sungjin Park, Seungwoo Ryu and Edward Choi", "abstract": "Recent success of pre-trained language models (PLMs) has stimulated interest in their ability to understand and work with numbers. Yet, the numerical reasoning over measurements has not been formally studied despite their importance. In this study, we show that PLMs lack the capability required for reasoning over measurements. Furthermore, we find that a language model trained on a measurement-rich corpus shows better performance on understanding measurements. We propose a simple embedding strategy to better distinguish between numbers and units, which leads to a significant improvement in the probing tasks.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [1.8120239973068237, 8.973163604736328], "id": 1070, "title": "Reconciliation of Pre-trained Models and Prototypical Neural Networks in Few-shot Named Entity Recognition", "authors": "Youcheng Huang, Wenqiang Lei, Jie Fu and Jiancheng Lv", "abstract": "Incorporating large-scale pre-trained models with the prototypical neural networks is a de-facto paradigm in few-shot named entity recognition. Existing methods, unfortunately, are not aware of the fact that embeddings from pre-trained models contain a prominently large amount of information regarding word frequencies, biasing prototypical neural networks against learning word entities. This discrepancy constrains the two models' synergy. Thus, we propose a one-line-code normalization method to reconcile such a mismatch with empirical and theoretical grounds. Our experiments based on nine benchmark datasets show the superiority of our method over the counterpart models and are comparable to the state-of-the-art methods. In addition to the model enhancement, our work also provides an analytical viewpoint for addressing the general problems in few-shot name entity recognition or other tasks that rely on pre-trained models or prototypical neural networks.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [2.0803070068359375, 7.576641082763672], "id": 1071, "title": "HCL-TAT: A Hybrid Contrastive Learning Method for Few-shot Event Detection with Task-Adaptive Threshold", "authors": "Ruihan Zhang, Wei Wei, Xian-Ling Mao, rui fang and Dangyang Chen", "abstract": "Event detection has been suffering from constantly emerging event types with lack of sufficient data. Existing works formulate the new problem as few-shot event detection (FSED), and employ two-stage or unified models based on meta-learning to address the problem. However, these methods fall far short of expectations due to: (i) insufficient learning of discriminative representations in low-resource scenarios, and (ii) representation overlap between triggers and non-triggers. To resolve the above issues, in this paper, we propose a novel Hybrid Contrastive Learning method with a Task-Adaptive Threshold (abbreviated as HCL-TAT), which enables discriminative representation learning with a two-view contrastive loss (support-support and prototype-query), and devises an easily-adapted threshold to alleviate misidentification of triggers. Extensive experiments on the benchmark dataset FewEvent demonstrate the superiority of our method to achieve better results compared to the state-of-the-arts. All the data and codes will be available to facilitate future research.", "track": "Information Extraction", "label": 5}, {"loc": [4.149147987365723, 7.391139030456543], "id": 1092, "title": "Doc2Bot: Accessing Heterogeneous Documents via Conversational Bots", "authors": "Haomin Fu, Yeqin Zhang, Haiyang Yu, Jian Sun, Fei Huang, Luo Si, Yongbin Li and Cam Tu Nguyen", "abstract": "This paper introduces Doc2Bot, a novel dataset for building machines that help users seek information via conversations. This is of particular interest for companies and organizations that own a large number of manuals or instruction books. Despite its potential, the nature of our task poses several challenges: (1) documents contain various structures that hinder the ability of machines to comprehend, and (2) user information needs are often underspecified. Compared to prior datasets that either focus on a single structural type or overlook the role of questioning to uncover user needs, the Doc2Bot dataset is developed to target such challenges systematically. Our dataset contains over 100,000 turns based on Chinese documents from five domains, larger than any prior document-grounded dialog dataset for information seeking. We propose three tasks in Doc2Bot: (1) dialog state tracking to track user intentions, (2) dialog policy learning to plan system actions and contents, and (3) response generation which generates responses based on the outputs of the dialog policy. Baseline methods based on the latest deep learning models are presented, indicating that our proposed tasks are challenging and worthy of further research.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [1.7070924043655396, 9.068937301635742], "id": 1101, "title": "DualNER: A Dual-Teaching framework for Zero-shot Cross-lingual Named Entity Recognition", "authors": "Jiali Zeng, Yufan Jiang, Yongjing Yin, Xu Wang, Binghuai Lin and Yunbo Cao", "abstract": "We present DualNER, a simple and effective framework to make full use of both annotated source language corpus and unlabeled target language text for zero-shot cross-lingual named entity recognition (NER). In particular, we combine two complementary learning paradigms of NER, i.e., sequence labeling and span prediction, into a unified multi-task framework. After obtaining a sufficient NER model trained on the source data, we further train it on the target data in a {\\it dual-teaching} manner, in which the pseudo-labels for one task are constructed from the prediction of the other task. Moreover, based on the span prediction, an entity-aware regularization is proposed to enhance the intrinsic cross-lingual alignment between the same entities in different languages. Experiments and analysis demonstrate the effectiveness of our DualNER.", "track": "Multilinguality", "label": 13}, {"loc": [2.507970094680786, 4.832772731781006], "id": 1108, "title": "Knowledge-augmented Self-training of A Question Rewriter for Conversational Knowledge Base Question Answering", "authors": "Xirui Ke, Jing Zhang, Xin Lv, Yiqi Xu, Shulin Cao, Cuiping Li, Hong Chen and Juanzi Li", "abstract": "The recent rise of conversational applications such as online customer service systems and intelligent personal assistants has promoted the development of conversational knowledge base question answering (ConvKBQA). Different from the traditional single-turn KBQA, ConvKBQA usually explores multi-turn questions around a topic, where ellipsis and coreference pose great challenges to the single-turn KBQA systems which require self-contained questions. In this paper, we propose a rewrite-and-reason framework to first produce a full-fledged rewritten question based on the conversation history and then reason the answer by existing single-turn KBQA models. To overcome the absence of the rewritten supervision signals, we introduce a knowledge-augmented self-training mechanism to transfer the question rewriter from another dataset to adapt to the current knowledge base. Our question rewriter is decoupled from the subsequent QA process, which makes it easy to be united with either retrieval-based or semantic parsing-based KBQA models. Experiment results demonstrate the effectiveness of our method and a new state-of-the-art result is achieved. The code and dataset are available online now.", "track": "Question Answering", "label": 11}, {"loc": [3.6973989009857178, 9.376612663269043], "id": 1134, "title": "Extractive Summarization of Legal Decisions using Multi-task Learning and Maximal Marginal Relevance", "authors": "Abhishek Agarwal, Shanshan Xu and Matthias Grabmair", "abstract": "Summarizing legal decisions requires the expertise of law practitioners, which is both time- and cost-intensive. This paper presents techniques for extractive summarization of legal decisions in a low-resource setting using limited expert annotated data. We test a set of models that locate relevant content using a sequential model and tackle redundancy by leveraging maximal marginal relevance to compose summaries. We also demonstrate an implicit approach to help train our proposed models generate more informative summaries. Our multi-task learning model variant leverages rhetorical role identification as an auxiliary task to further improve the summarizer. We perform extensive experiments on datasets containing legal decisions from the US Board of Veterans' Appeals and conduct quantitative and expert-ranked evaluations of our models. Our results show that the proposed approaches can achieve ROUGE scores vis-\u00e0-vis expert extracted summaries that match those achieved by inter-annotator comparison.", "track": "Ethic Concerns:Summarization", "label": 14}, {"loc": [4.967792510986328, 12.052706718444824], "id": 1153, "title": "MovieUN: A Dataset for Movie Understanding and Narrating", "authors": "QI ZHANG, Zihao Yue, Anwen Hu, Ziheng Wang and Qin Jin", "abstract": "Automatic movie narration generation and narration grounding are very important to provide a true movie experience for the blind and visually impaired. To tell the movie story well, it is necessary to mention plot-related details (such as character names) and keep the narrations in a plot coherent. Taking these two points into consideration, we construct a Chinese large-scale video benchmark from 101 movies for Movie Understanding and Narrating (MovieUN) to support the Movie Clip Narrating (MCN) task and Temporal Narration Grounding (TNG) task. We split movies in MovieUN into movie clips according to plots, and pair them with corresponding narrations provided by the movie narrators. Ultimately, the TNG task involves 3,253 long video clips totaling 179 hours. The MCN task contains 33,060 video clips totaling 105 hours. We benchmark state-of-the-art video captioning models and temporal grounding models in MCN and TNG tasks, respectively. Furthermore, to accurately comprehend plots of different characters, we propose methods to incorporate portraits of actors as external knowledge in both tasks. The experiment results demonstrate the effectiveness of our proposed methods. The dataset and codes are released at https://github.com/yuezih/MovieUN.", "track": "Resources and Evaluation", "label": 1}, {"loc": [5.902313709259033, 8.507898330688477], "id": 1154, "title": "ASDOT: Any-Shot Data-to-Text Generation with Pretrained Language Models", "authors": "Jiannan Xiang, Zhengzhong Liu, Yucheng Zhou, Eric P. Xing and Zhiting Hu", "abstract": "Data-to-text generation is challenging due to the great variety of the input data in terms of domains (e.g., finance vs sports) or schemata (e.g., diverse predicates). Recent end-to-end neural methods thus require substantial training examples to learn to disambiguate and describe the data. Yet, real-world data-to-text problems often suffer from various data-scarce issues: one may have access to only a handful of or no training examples, and/or have to rely on examples in a different domain or schema. To fill this gap, we propose Any-Shot Data-to-Text (ASDOT), a new approach flexibly applicable to diverse settings by making efficient use of any given (or no) examples. ASDOT consists of two steps, data disambiguation and sentence fusion, both of which are amenable to be solved with off-the-shelf pretrained language models (LMs) with optional finetuning. In the data disambiguation stage, we employ the prompted GPT-3 model to understand possibly ambiguous triples from the input data and convert each into a short sentence with reduced ambiguity. The sentence fusion stage then uses an LM like T5 to fuse all the resulting sentences into a coherent paragraph as the final description. We evaluate extensively on various datasets in different scenarios, including the zero-/few-/full-shot settings, and generalization to unseen predicates and out-of-domain data. Experimental results show that ASDOT consistently achieves significant improvement over baselines, e.g., a 30.81 BLEU gain on the DART dataset under the zero-shot setting.", "track": "Natural Language Generation", "label": 6}, {"loc": [10.823212623596191, 9.343153953552246], "id": 1157, "title": "FCGEC: Fine-Grained Corpus for Chinese Grammatical Error Correction", "authors": "Lvxiaowei Xu, Jianwang Wu, Jiawei Peng, Jiayu Fu and Ming Cai", "abstract": "Grammatical Error Correction (GEC) has been broadly applied in automatic correction and proofreading system recently. However, it is still immature in Chinese GEC due to limited high-quality data from native speakers in terms of category and scale. In this paper, we present FCGEC, a fine-grained corpus to detect, identify and correct the grammatical errors. FCGEC is a human-annotated corpus with multiple references, consisting of 41,340 sentences collected mainly from multi-choice questions in public school Chinese examinations. Furthermore, we propose a Switch-Tagger-Generator (STG) baseline model to correct the grammatical errors in low-resource settings. Compared to other GEC benchmark models, experimental results illustrate that STG outperforms them on our FCGEC. However, there exists a significant gap between benchmark models and humans that encourages future models to bridge it.", "track": "Resources and Evaluation", "label": 1}, {"loc": [5.420965671539307, 8.320318222045898], "id": 1160, "title": "Audience-Centric Natural Language Generation via Style Infusion", "authors": "Samraj Moorjani, Adit Krishnan, Hari Sundaram, Ewa Maslowska and Aravind Sankar", "abstract": "Adopting contextually appropriate, audience-tailored linguistic styles is critical to the success of user-centric language generation systems (e.g., chatbots, computer-aided writing, dialog systems). While existing approaches demonstrate text style transfer (TST) with large volumes of parallel or non-parallel data, we argue that grounding style on audience-independent external factors is innately limiting for two reasons. First, it is difficult to collect large volumes of audience-specific stylistic data. Second, some stylistic objectives (e.g., persuasiveness, memorability, empathy) are hard to define without audience feedback. \n\nIn this paper, we propose the novel task of style infusion - infusing the stylistic preferences of audiences in pretrained language generation models. Since humans are better at pairwise comparisons than direct scoring - i.e., is Sample-A more persuasive/polite/empathic than Sample-B - we leverage limited pairwise human judgments to bootstrap a style analysis model and augment our seed set of judgments. We then infuse the learned textual style in a GPT-2 based text generator while balancing fluency and style adoption. With quantitative and qualitative assessments, we show that our infusion approach can generate compelling stylized examples with generic text prompts. We make the anonymized code and data accessible.", "track": "Natural Language Generation", "label": 6}, {"loc": [4.0762858390808105, 8.105022430419922], "id": 1163, "title": "DocFin: Multimodal Financial Prediction and Bias Mitigation using Semi-structured Documents", "authors": "Puneet Mathur, Mihir Goyal, Ramit Sawhney, Ritik Mathur, Jochen L. Leidner, Franck Dernoncourt and Dinesh Manocha", "abstract": "Financial prediction is complex due to the stochastic nature of the stock market. Semi-structured financial documents present comprehensive financial data in tabular formats, such as earnings, profit-loss statements, and balance sheets, and can often contain rich technical analysis along with a textual discussion of corporate history, and management analysis, compliance, and risks. Existing research focuses on the textual and audio modalities of financial disclosures from company conference calls to forecast stock volatility and price movement, but ignores the rich tabular data available in financial reports. Moreover, the economic realm is still plagued with a severe under-representation of various communities spanning diverse demographics, gender, and native speakers. In this work, we show that combining tabular data from financial semi-structured documents with text transcripts and audio recordings not only improves stock volatility and price movement prediction by 5-12% but also reduces gender bias caused due to audio-based neural networks by over 30%.", "track": "NLP Applications", "label": 0}, {"loc": [0.7808869481086731, 8.056486129760742], "id": 1168, "title": "Not Just Plain Text! Fuel Document-Level Relation Extraction with Explicit Syntax Refinement and Subsentence Modeling", "authors": "Zhichao Duan, Xiuxing Li, Zhenyu Li, Zhuo Wang and Jianyong Wang", "abstract": "Document-level relation extraction (DocRE) aims to identify semantic labels among entities within a single document. One major challenge of DocRE is to dig decisive details regarding a specific entity pair from long text. However, in many cases, only a fraction of text carries required information, even in the manually labeled supporting evidence. To better capture and exploit instructive information, we propose a novel expLicit syntAx Refinement and Subsentence mOdeliNg based framework (LARSON). By introducing extra syntactic information, LARSON can model subsentences of arbitrary granularity and efficiently screen instructive ones. Moreover, we incorporate refined syntax into text representations which further improves the performance of LARSON. Experimental results on three benchmark datasets (DocRED, CDR, and GDA) demonstrate that LARSON significantly outperforms existing methods.", "track": "Information Extraction", "label": 5}, {"loc": [9.822071075439453, 7.907843589782715], "id": 1171, "title": "Self-supervised Rewiring of Pre-trained Speech Encoders: \\\\Towards Faster Fine-tuning with Less Labels in Speech Processing", "authors": "Hao Yang, Jinming Zhao, Gholamreza Haffari and Ehsan Shareghi", "abstract": "Pre-trained speech Transformers have facilitated great success across various speech processing tasks. However, fine-tuning these encoders for downstream tasks require sufficiently large training data to converge or to achieve state-of-the-art. In text domain this has been partly attributed to sub-optimality of the representation space in pre-trained Transformers. In this work, we take a sober look into pre-trained speech encoders and rewire their representation space without requiring any task-specific labels. Our method utilises neutrally synthesised version of audio inputs along with frame masking to construct positive pairs for contrastive self-supervised learning. When used for augmenting the wav2vec 2 encoder, we observe consistent improvement of isotropy in the representation space. Our experiments on 6 speech processing tasks, exhibit a significant convergence speedup during task fine-tuning as well as consistent task improvement, specially in low-resource settings.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [10.16253662109375, 7.8309807777404785], "id": 1177, "title": "RedApt: An Adaptor for wav2vec 2 Encoding \\\\ Faster and Smaller Speech Translation without Quality Compromise", "authors": "Jinming Zhao, Hao Yang, Gholamreza Haffari and Ehsan Shareghi", "abstract": "Pre-trained speech Transformers in speech translation (ST) have facilitated state-of-the-art (SotA) results; yet, using such encoders is computationally expensive. To improve this, we present a novel Reducer Adaptor block, RedApt, that could be seamlessly integrated within any Transformer-based speech encoding architecture. Integrating the pretrained wav2vec 2 speech encoder with RedAptbrings 41% speedup, 33% memory reduction with 24% fewer FLOPs at inference. To our positive surprise, our ST model with RedApt outperforms the SotA architecture by an average of 0.68 BLEU score on 8 language pairs from Must-C.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [10.864821434020996, 6.833907127380371], "id": 1181, "title": "How sensitive are translation systems to extra contexts? Mitigating gender bias in Neural Machine Translation models through relevant contexts.", "authors": "Shanya Sharma, Manan Dey and Koustuv Sinha", "abstract": "Neural Machine Translation systems built on top of Transformer-based architectures are routinely improving the state-of-the-art in translation quality according to word-overlap metrics. However, a growing number of studies also highlight the inherent gender bias that these models incorporate during training, which reflects poorly in their translations. In this work, we investigate whether these models can be instructed to fix their bias during inference using targeted, guided instructions as contexts. By translating relevant contextual sentences during inference along with the input, we observe large improvements in reducing the gender bias in translations, across three popular test suites (WinoMT, BUG, SimpleGen). We further propose a novel metric to assess several large pre-trained models (OPUS-MT, M2M-100) on their sensitivity towards using contexts during translation to correct their biases. Our approach requires no fine-tuning, and thus can be used easily in production systems to de-bias translations from stereotypical gender-occupation bias. We hope our method, along with our metric, can be used to build better, bias-free translation systems.", "track": "Machine Translation", "label": 10}, {"loc": [2.508455991744995, 8.669610977172852], "id": 1191, "title": "P$\\text{M}^2\\text{F}^2$N: Patient Multi-view Multi-modal Feature Fusion Networks for Clinical Outcome Prediction", "authors": "Ying Zhang, Baohang Zhou, Kehui Song, Xuhui Sui, Guoqing Zhao, Ning Jiang and Xiaojie Yuan", "abstract": "Clinical outcome prediction is critical to the condition prediction of patients and management of hospital capacities. There are two kinds of medical data, including time series signals recorded by various devices and clinical notes in electronic health records (EHR), which are used for two common prediction targets: mortality and length of stay. Traditional methods focused on utilizing time series data but ignored clinical notes. With the development of deep learning, natural language processing (NLP) and multi-modal learning methods are exploited to jointly model the time series and clinical notes with different modals. However, the existing methods failed to fuse the multi-modal features of patients from different views. Therefore, we propose the patient multi-view multi-modal feature fusion networks for clinical outcome prediction. Firstly, from patient inner view, we propose to utilize the co-attention module to enhance the fine-grained feature interaction between time series and clinical notes from each patient. Secondly, the patient outer view is the correlation between patients, which can be reflected by the structural knowledge in clinical notes. We exploit the structural information extracted from clinical notes to construct the patient correlation graph, and fuse patients' multi-modal features by graph neural networks (GNN). The experimental results on MIMIC-III benchmark demonstrate the superiority of our method.", "track": "NLP Applications", "label": 0}, {"loc": [3.7539517879486084, 9.406847953796387], "id": 1202, "title": "Long Text and Multi-Table Summarization: Dataset and Method", "authors": "Shuaiqi LIU, Jiannong Cao, Ruosong Yang and Zhiyuan Wen", "abstract": "Automatic document summarization aims to produce a concise summary covering the input document's salient information. Within a report document, the salient information can be scattered in the textual and non-textual content. However, existing document summarization datasets and methods usually focus on the text and filter out the non-textual content. Missing tabular data can limit produced summaries' informativeness, especially when summaries require covering quantitative descriptions of critical metrics in tables. Existing datasets and methods cannot meet the requirements of summarizing long text and multiple tables in each report. To deal with the scarcity of available data, we propose FINDSum, the first large-scale dataset for long text and multi-table summarization. Built on 21,125 annual reports from 3,794 companies, it has two subsets for summarizing each company's results of operations and liquidity. To summarize the long text and dozens of tables in each report, we present three types of summarization methods. Besides, we propose a set of evaluation metrics to assess the usage of numerical information in produced summaries. Dataset analyses and experimental results indicate the importance of jointly considering input textual and tabular data when summarizing report documents.", "track": "Summarization", "label": 14}, {"loc": [1.9611045122146606, 4.037552833557129], "id": 1210, "title": "MatRank: Text Re-ranking by Latent Preference Matrix", "authors": "jinwen luo, Jiuding Yang, Weidong Guo, Chenglin Li, Di Niu and Yu Xu", "abstract": "Text ranking plays a key role in providing content that best answers user queries. It is usually divided into two sub-tasks to perform efficient information retrieval given a query: text retrieval and text re-ranking. Recent research on pretrained language models (PLM) has demonstrated efficiency and gain on both sub-tasks. However, while existing methods have benefited from pre-trained language models and achieved high recall rates on passage retrieval, the ranking performance still demands further improvement. In this paper, we propose MatRank, which learns to re-rank the text retrieved for a given query by learning to predict the most relevant passage based on a latent preference matrix. Specifically, MatRank uses a PLM to generate an asymmetric latent matrix of relative preference scores between all pairs of retrieved passages. Then, the latent matrix is aggregated row-wise and column-wise to obtain global preferences and predictions of the most relevant passage in two of these directions, respectively. We conduct extensive experiments on MS MACRO, WikiAQ, and SemEval datasets. Experimental results show that MatRank has achieved new state-of-the-art results on these datasets, outperforming all prior methods on ranking performance metrics.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [0.7015656232833862, 6.488614559173584], "id": 1214, "title": "Can Language Models Serve as Temporal Knowledge Bases?", "authors": "Ruilin Zhao, Feng Zhao, Guandong Xu, Sixiao Zhang and Hai Jin", "abstract": "Recent progress regarding the use of language models (LMs) as knowledge bases (KBs) has shown that language models can act as structured knowledge bases for storing relational facts. However, most existing works only considered the LM-as-KB paradigm in a static setting, which ignores the analysis of temporal dynamics of world knowledge. Furthermore, a basic function of KBs, i.e., the ability to store conflicting information (i.e., 1-N, N-1, and N-M relations), is underexplored. In this paper, we formulate two practical requirements for treating LMs as temporal KBs: (i) The capacity to store temporally-scoped knowledge that contains conflicting information and (ii) the ability to use stored knowledge for temporally-scoped knowledge queries. We introduce a new dataset called LAMA-TK which is aimed at probing temporally-scoped knowledge, and investigate the two above requirements to explore the LM-as-KB paradigm in the temporal domain. On the one hand, experiments show that LMs can memorize millions of temporally-scoped facts with relatively high accuracy and transfer stored knowledge to temporal knowledge queries, thereby expanding the LM-as-KB paradigm to the temporal domain. On the other hand, we show that memorizing conflicting information, which has been neglected by previous works, is still challenging for LMs and hinders the memorization of other unrelated one-to-one relationships.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [0.9554713368415833, 6.856616973876953], "id": 1217, "title": "UNTER: A Unified Knowledge Interface for Enhancing Pre-trained Language Models", "authors": "Deming Ye, Yankai Lin, Zhengyan Zhang and Maosong Sun", "abstract": "Recent research demonstrates that external knowledge injection can advance pre-trained language models (PLMs) in a variety of downstream NLP tasks. However, existing knowledge injection methods are either applicable to structured knowledge or unstructured knowledge, lacking a unified usage. \nIn this paper, we propose a UNified knowledge inTERface, UNTER to provide a unified perspective to exploit both structured knowledge and unstructured knowledge. In UNTER, we adopt the decoder as a unified knowledge interface, aligning span representations obtained from the encoder with their corresponding knowledge. This approach enables the encoder to uniformly invoke span-related knowledge from its parameters for downstream applications. Experimental results show that, with both forms of knowledge injected, UNTERgains notable improvements on a series of knowledge-driven NLP tasks, including entity typing, named entity recognition and relation extraction, especially in low-resource scenarios. We will release all pre-trained models and code to facilitate the research of unified knowledge injection.", "track": "Information Extraction", "label": 5}, {"loc": [8.181865692138672, 3.0802934169769287], "id": 1220, "title": "Are Large Pre-Trained Language Models Leaking Your Personal Information?", "authors": "Jie Huang, Hanyin Shao and Kevin Chang", "abstract": "Are Large Pre-Trained Language Models Leaking Your Personal Information? In this paper, we analyze whether Pre-Trained Language Models (PLMs) are prone to leaking personal information. Specifically, we query PLMs for email addresses with contexts of the email address or prompts containing the owner's name. We find that PLMs do leak personal information due to memorization. However, since the models are weak at association, the risk of specific personal information being extracted by attackers is low. We hope this work could help the community to better understand the privacy risk of PLMs and bring new insights to make PLMs safe.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [0.6181617379188538, 6.812732696533203], "id": 1225, "title": "Self-Distillation with Meta Learning for Knowledge Graph Completion", "authors": "Yunshui Li, Junhao Liu, Min Yang and Chengming Li", "abstract": "In this paper, we propose a self-distillation framework with meta learning (MetaSD) for knowledge graph completion with dynamic pruning, which aims to learn compressed graph embeddings and tackle the long-tail samples. Specifically, we first propose a dynamic pruning technique to obtain a small pruned model from a large source model, where the pruning mask of the pruned model could be updated adaptively per epoch after the model weights are updated. The pruned model is supposed to be more sensitive to difficult-to-memorize samples (e.g., long-tail samples) than the source model. Then, we propose a one-step meta self-distillation method for distilling comprehensive knowledge from the source model to the pruned model, where the two models co-evolve in a dynamic manner during training. In particular, we exploit the performance of the pruned model, which is trained alongside the source model in one iteration, to improve the source model's knowledge transfer ability for the next iteration via meta learning. Extensive experiments show that MetaSD achieves competitive performance compared to strong baselines, while being 10x smaller than baselines.", "track": "NLP Applications", "label": 0}, {"loc": [1.3681929111480713, 4.9261345863342285], "id": 1228, "title": "CQR-SQL: Conversational Question Reformulation Enhanced Context-Dependent Text-to-SQL Parsers", "authors": "Dongling Xiao, LinZheng Chai, Qian-Wen Zhang, Zhao Yan, Zhoujun Li and Yunbo Cao", "abstract": "Context-dependent text-to-SQL is the task of translating multi-turn questions into database-related SQL queries. Existing methods typically focus on making full use of history context or previously predicted SQL for currently SQL parsing, while neglecting to explicitly comprehend the schema and conversational dependency, such as co-reference, ellipsis and user focus change. In this paper, we propose CQR-SQL, which uses auxiliary Conversational Question Reformulation (CQR) learning to explicitly exploit schema and decouple contextual dependency for multi-turn SQL parsing. Specifically, we first present a schema enhanced recursive CQR method to produce domain-relevant self-contained questions. Secondly, we train CQR-SQL models to map the semantics of multi-turn questions and auxiliary self-contained questions into the same latent space through schema grounding consistency task and tree-structured SQL parsing consistency task, which enhances the abilities of SQL parsing by adequately contextual understanding. At the time of writing, our CQR-SQL achieves new state-of-the-art results on two context-dependent text-to-SQL benchmarks SParC and CoSQL.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [4.486868858337402, 5.488314628601074], "id": 1236, "title": "Assisting the Human Fact-Checkers: Detecting All Previously Fact-Checked Claims in a Document", "authors": "Shaden Shaar, Nikola Georgiev, Firoj Alam, Giovanni Da San Martino, Aisha Mohamed and Preslav Nakov", "abstract": "Given the recent proliferation of false claims online, there has been a lot of manual fact-checking effort. As this is very time-consuming, human fact-checkers can benefit from tools that can support them and make them more efficient. Here, we focus on building a system that could provide such support. Given an input document, it aims to detect all sentences that contain a claim that can be verified by some previously fact-checked claims (from a given database). The output is a re-ranked list of the document sentences, so that those that can be verified are ranked as high as possible, together with corresponding evidence. Unlike previous work, which has looked into claim retrieval, here we take a document-level perspective. We create a new manually annotated dataset for the task, and we propose suitable evaluation measures. We further experiment with a learning-to-rank approach, achieving sizable performance gains over several strong baselines. Our analysis demonstrates the importance of modeling text similarity and stance, while also taking into account the veracity of the retrieved previously fact-checked claims. We believe that this research would be of interest to fact-checkers, journalists, media, and regulatory authorities.", "track": "NLP Applications", "label": 0}, {"loc": [6.285833358764648, 5.764007091522217], "id": 1260, "title": "No Word Embedding Model Is Perfect: Evaluating the Representation Accuracy for Social Bias in the Media", "authors": "Maximilian Splieth\u00f6ver, Maximilian Keiff and Henning Wachsmuth", "abstract": "News articles both shape and reflect public opinion across the political spectrum. Analyzing them for social bias can thus provide valuable insights, such as prevailing stereotypes in society and the media, which are often adopted by NLP models trained on respective data. Recent work has relied on word embedding bias measures, such as WEAT. However, several representation issues of embeddings can harm the measures' accuracy, including low-resource settings and token frequency differences. In this work, we study what kind of embedding algorithm serves best to accurately measure types of social bias known to exist in US online news articles. To cover the whole spectrum of political bias in the US, we collect 500k articles and review psychology literature with respect to expected social bias. We then quantify social bias using WEAT along with embedding algorithms that account for the aforementioned issues. We compare how models trained with the algorithms on news articles represent the expected social bias. Our results suggest that the standard way to quantify bias does not align well with knowledge from psychology. While the proposed algorithms reduce the~gap, they still do not fully match the literature.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [4.740279674530029, 3.6383464336395264], "id": 1275, "title": "Scientific and Creative Analogies in Pretrained Language Models", "authors": "Tamara Czinczoll, Helen Yannakoudakis, Pushkar Mishra and Ekaterina Shutova", "abstract": "This paper examines the encoding of analogy in large-scale pretrained language models, such as BERT and GPT-2. Existing analogy datasets typically focus on a limited set of analogical relations, with a high similarity of the two domains between which the analogy holds. As a more realistic setup, we introduce the Scientific and Creative Analogy dataset (SCAN), a novel analogy dataset containing systematic mappings of multiple attributes and relational structures across dissimilar domains. Using this dataset, we test the analogical reasoning capabilities of several widely-used pretrained language models (LMs). We find that state-of-the-art LMs achieve low performance on these complex analogy tasks, highlighting the challenges still posed by analogy understanding.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [9.031679153442383, 6.33148717880249], "id": 1303, "title": "Bitext Mining Using Distilled Sentence Representations for Low-Resource Languages", "authors": "Kevin Heffernan, Onur \u00c7elebi and Holger Schwenk", "abstract": "Scaling multilingual representation learning beyond the hundred most frequent languages is challenging, in particular to cover the long tail of low-resource languages. We move away from the popular one-for-all multilingual models and focus on training multiple language (family) specific representations, but most prominently enable all languages to still be encoded in the same representational space. We focus on teacher-student training, allowing all encoders to be mutually compatible for bitext mining, and enabling fast learning of new languages. We also combine supervised and self-supervised training, allowing encoders to take advantage of monolingual training data.\n\nOur approach significantly outperforms the original LASER encoder. We study very low-resource languages and handle 44 African languages, many of which are not covered by any other model. For these languages, we train sentence encoders and mine bitexts. Adding these mined bitexts yielded an improvement of 3.8 BLEU for NMT into English.", "track": "Machine Translation", "label": 10}, {"loc": [1.3454856872558594, 4.9491095542907715], "id": 1309, "title": "Towards Generalizable and Robust Text-to-SQL Parsing", "authors": "Chang Gao, Bowen Li, Wenxuan Zhang, Wai Lam, Binhua Li, Fei Huang, Luo Si and Yongbin Li", "abstract": "Text-to-SQL parsing tackles the problem of mapping natural language questions to executable SQL queries. In practice, text-to-SQL parsers often encounter various challenging scenarios, requiring them to be generalizable and robust. While most existing work addresses a particular generalization or robustness challenge, we aim to study it in a more comprehensive manner. In specific, we believe that text-to-SQL parsers should be (1) generalizable at three levels of generalization, namely i.i.d., zero-shot, and compositional, and (2) robust against input perturbations. To enhance these capabilities of the parser, we propose a novel TKK framework consisting of Task decomposition, Knowledge acquisition, and Knowledge composition to learn text-to-SQL parsing in stages. By dividing the learning process into multiple stages, our framework improves the parser's ability to acquire general SQL knowledge instead of capturing spurious patterns, making it more generalizable and robust. Experimental results under various generalization and robustness settings show that our framework is effective in all scenarios and achieves state-of-the-art performance on the Spider, SParC, and CoSQL datasets.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [6.170919418334961, 8.519989967346191], "id": 1316, "title": "EdiT5: Semi-Autoregressive Text Editing with T5 Warm-Start", "authors": "Jonathan Mallinson, Jakub Adamek, Eric Malmi and Aliaksei Severyn", "abstract": "We present EdiT5 - a novel semi-autoregressive text-editing approach designed to combine the strengths of non-autoregressive text-editing and autoregressive decoding. EdiT5 is faster at inference times than conventional sequence-to-sequence (seq2seq) models, while being capable of modeling flexible input-output transformations.\n\nThis is achieved by decomposing the generation process into three sub-tasks: (1) tagging to decide on the subset of input tokens to be preserved in the output, (2) re-ordering to define their order in the output text, and (3) insertion to infill the missing tokens that are not present in the input. The tagging and re-ordering steps, which are responsible for generating the largest portion of the output, are non-autoregressive, while the insertion uses an autoregressive decoder.\n\nDepending on the task, EdiT5 requires significantly fewer autoregressive steps demonstrating speedups of up to 25x when compared to classic seq2seq models. Quality-wise, EdiT5 is initialized with a pre-trained T5 checkpoint yielding comparable performance to T5 in high-resource settings and clearly outperforms it on low-resource settings when evaluated on three NLG tasks: Sentence Fusion, Grammatical Error Correction, and Decontextualization.", "track": "Natural Language Generation", "label": 6}, {"loc": [4.792313098907471, 6.8947367668151855], "id": 1318, "title": "A Critical Reflection and Forward Perspective on Empathy and Natural Language Processing", "authors": "Allison Claire Lahnala, Charles Welch, David Jurgens and Lucie Flek", "abstract": "We review the state of research on empathy in natural language processing and identify the following issues: (1) empathy definitions are absent or abstract, which (2) leads to low construct validity and reproducibility. Moreover, (3) emotional empathy is overemphasized, skewing our focus to a narrow subset of simplified tasks. We believe these issues hinder research progress and argue that current directions will benefit from a clear conceptualization that includes operationalizing cognitive empathy components. Our main objectives are to provide insight and guidance on empathy conceptualization for NLP research objectives and to encourage researchers to pursue the overlooked opportunities in this area, highly relevant, e.g., for clinical and educational sectors.", "track": "Theme Track", "label": 18}, {"loc": [2.9541919231414795, 6.095975399017334], "id": 1327, "title": "Social-aware Sparse Attention Network for Session-based Social Recommendation", "authors": "Kai Ouyang, Xianghong Xu, Chen Tang, Wang Chen and Haitao Zheng", "abstract": "Session-based Social Recommendation (SSR) aims to use users' social networks and historical sessions to provide more personalized recommendations for the current session.\nUnfortunately, existing SSR methods have two limitations.\nFirst, they do not screen users' useless social relationships and noisy irrelevant interactions.\nHowever, user preferences are mainly affected by several close friends and key interactions.\nSecond, when modeling the current session, they do not take full advantage of user preference information.\nTo tackle these issues, we propose a novel Social-aware Sparse Attention Network for SSR, abbreviated as SSAN.\nIt mainly consists of the Heterogeneous Graph Embedding (HGE) module and the Social-aware Encoder-decoder Network (SEN) module.\nIn the HGE module, we adopt a modified heterogeneous graph neural network, which focuses more on close friends and key historical interactions, to enhance user/item representations. \nIn the SEN module, we use the user representation as a bridge between the Encoder and Decoder to incorporate user preferences when modeling the current session.\nExtensive experiments on two benchmark datasets demonstrate the superiority of SSAN over the state-of-the-art models.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [8.779088973999023, 8.29272747039795], "id": 1329, "title": "SparseAdapter: An Easy Approach for Improving the Parameter-Efficiency of Adapters", "authors": "Shwai He, Liang Ding, DaiZe Dong, miao zhang and Dacheng Tao", "abstract": "Adapter Tuning, which freezes the pretrained language models (PLMs) and only fine-tunes a few extra modules, becomes an appealing efficient alternative to the full model fine-tuning. Although computationally efficient, the recent Adapters often increase parameters (e.g. bottleneck dimension) for matching the performance of full model fine-tuning, which we argue goes against their original intention. In this work, we re-examine the parameter-efficiency of Adapter through the lens of network pruning (we name such plug-in concept as SparseAdapter) and find that SparseAdapter can achieve comparable or better performance than standard Adapters when the sparse ratio reaches up to 80%. Based on our findings, we introduce an easy but effective setting \"Large-Sparse'' to improve the model capacity of Adapters under the same parameter budget. Experiments on five competitive Adapters upon three advanced PLMs show that with proper sparse method (e.g. SNIP) and ratio (e.g. 40%) SparseAdapter can consistently outperform their corresponding counterpart. Encouragingly, with the Large-Sparse setting, we can obtain further appealing gains, even outperforming the full fine-tuning by a large margin.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [4.370123386383057, 4.040160655975342], "id": 1358, "title": "Measurement Extraction with Natural Language Processing: A Review", "authors": "Jan G\u00f6pfert, Patrick Kuckertz, Jann M. Weinand, Leander Kotzur and Detlef Stolten", "abstract": "Quantitative data is important in many domains. Information extraction methods draw structured data from documents. However, the extraction of quantities and their contexts has received little attention in the history of information extraction. In this review, an overview of prior work on measurement extraction is presented. We describe different approaches to measurement extraction and outline the challenges posed by this task. The review concludes with an outline of potential future research. Research strains in measurement extraction tend to be isolated and lack a common terminology. Improvements in numerical reasoning, more extensive datasets, and the consideration of wider contexts may lead to significant improvements in measurement extraction.", "track": "Information Extraction", "label": 5}, {"loc": [3.625286817550659, 9.78835678100586], "id": 1380, "title": "Summarizing Procedural Text: Data and Approach", "authors": "Shen Gao, Haotong Zhang, Xiuying Chen, Rui Yan and Dongyan Zhao", "abstract": "Procedural text is a widely used genre that contains many steps of instructions of how to cook a dish or how to conduct a chemical experiment and analyze the procedural text has become a popular task in the NLP field. Since the procedural text can be very long and contains many details, summarizing the whole procedural text or giving an overview for each complicated procedure step can save time for readers and help them to capture the core information in the text. In this paper, we propose the procedural text summarization task with two summarization granularity: step-view and global-view, which summarizes each step in the procedural text separately or gives an overall summary for all steps respectively. To tackle this task, we propose an Entity-State Graph-based Summarizer (ESGS) which is based on state-of-the-art entity state tracking methods and constructs a heterogeneous graph to aggregate contextual information for each procedure. In order to help the summarization model focus on the salient entities, we propose to use the contextualized procedure graph representation to predict the salient entities. Experiments conducted on two datasets verify the effectiveness of our proposed model. Our code and datasets will be released on https://github.com/gsh199449/procedural-summ.", "track": "Summarization", "label": 14}, {"loc": [8.145788192749023, 8.720210075378418], "id": 1387, "title": "Snapshot-Guided Domain Adaptation for ELECTRA", "authors": "Daixuan Cheng, Shaohan Huang, Jianfeng Liu, Yuefeng Zhan, Hao Sun, Furu Wei, Denvy Deng and Qi Zhang", "abstract": "Discriminative pre-trained language models, such as ELECTRA, have achieved promising performances in a variety of general tasks. However, these generic pre-trained models struggle to capture domain-specific knowledge of domain-related tasks. In this work, we propose a novel domain-adaptation method for ELECTRA, which can dynamically select domain-specific tokens and guide the discriminator to emphasize them, without introducing new training parameters. We show that by re-weighting the losses of domain-specific tokens, ELECTRA can be effectively adapted to different domains. The experimental results in both computer science and biomedical domains show that the proposed method can achieve state-of-the-art results on the domain-related tasks.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [6.62643575668335, 8.206815719604492], "id": 1389, "title": "Exploiting Labeled and Unlabeled Data via Transformer Fine-tuning for Peer-Review Score Prediction", "authors": "Panitan Muangkammuen, Fumiyo Fukumoto, Jiyi Li and Yoshimi Suzuki", "abstract": "Automatic Peer-review Aspect Score Prediction (PASP) of academic papers can be a helpful assistant tool for both reviewers and authors. Most existing works on PASP utilize supervised learning techniques. However, the limited number of peer-review data deteriorates the performance of PASP. This paper presents a novel semi-supervised learning (SSL) method that incorporates the Transformer fine-tuning into the \u0393-model, a variant of the Ladder network, to leverage contextual features from unlabeled data. Backpropagation simultaneously minimizes the sum of supervised and unsupervised cost functions, avoiding the need for layer-wise pre-training. The experimental results show that our model outperforms the supervised and naive semi-supervised learning baselines. Our source codes are available online.", "track": "NLP Applications", "label": 0}, {"loc": [7.596724033355713, 12.36661148071289], "id": 1390, "title": "HARALD: Augmenting Hate Speech Data Sets with Real Data", "authors": "Tal Ilan and Dan Vilenchik", "abstract": "The successful completion of the hate speech detection task hinges upon the availability of rich and variable labeled data, which is hard to obtain. In this work, we present a new approach for data augmentation that uses as input real unlabelled data, which is carefully selected from online platforms where invited hate speech is abundant. We show that by harvesting and processing this data (in an automatic manner), one can augment existing manually-labeled datasets to improve the classification performance of hate speech classification models. We observed an improvement in F1-score ranging from 2.7\\% and up to 9.5\\%, depending on the task (in- or cross-domain) and the model used.", "track": "NLP Applications", "label": 0}, {"loc": [10.561056137084961, 7.494224548339844], "id": 1419, "title": "Wait-info Policy: Balancing Source and Target at Information Level for Simultaneous Machine Translation", "authors": "Shaolei Zhang, Shoutao Guo and Yang Feng", "abstract": "Simultaneous machine translation (SiMT) outputs the translation while receiving the source inputs, and hence needs to balance the received source information and translated target information to make a reasonable decision between waiting for inputs or outputting translation. Previous methods always balance source and target information at the token level, either directly waiting for a fixed number of tokens or adjusting the waiting based on the current token. In this paper, we propose a Wait-info Policy to balance source and target at the information level. We first quantify the amount of information contained in each token, named info. Then during simultaneous translation, the decision of waiting or outputting is made based on the comparison results between the total info of previous target outputs and received source inputs. Experiments show that our method outperforms strong baselines under and achieves better balance via the proposed info.", "track": "Machine Translation", "label": 10}, {"loc": [10.554722785949707, 7.513275146484375], "id": 1424, "title": "Turning Fixed to Adaptive: Integrating Post-Evaluation into Simultaneous Machine Translation", "authors": "Shoutao Guo, Shaolei Zhang and Yang Feng", "abstract": "Simultaneous machine translation (SiMT) starts its translation before reading the whole source sentence and employs either fixed or adaptive policy to generate the target sentence. Compared to the fixed policy, the adaptive policy achieves better latency-quality tradeoffs by adopting a flexible translation policy. If the policy can evaluate rationality before taking action, the probability of incorrect actions will also decrease. However, previous methods lack evaluation of actions before taking them. In this paper, we propose a method of performing the adaptive policy via integrating post-evaluation into the fixed policy. Specifically, whenever a candidate token is generated, our model will evaluate the rationality of the next action by measuring the change in the source content. Our model will then take different actions based on the evaluation results. Experiments on three translation tasks show that our method can exceed strong baselines under all latency.", "track": "Machine Translation", "label": 10}, {"loc": [0.5531882047653198, 7.093504905700684], "id": 1446, "title": "Alleviating Sparsity of Open Knowledge Graphs with Ternary Contrastive Learning", "authors": "Qian Li, Shafiq Joty, Daling Wang, Shi Feng and Yifei Zhang", "abstract": "Sparsity of formal knowledge and roughness of non-ontological construction make sparsity problem particularly prominent in Open Knowledge Graphs (OpenKGs). Due to sparse links, learning effective representation for few-shot entities becomes difficult. We hypothesize that by introducing negative samples, a contrastive learning (CL) formulation could be beneficial in such scenarios. However, existing CL methods model KG triplets as binary objects of entities ignoring the relation-guided ternary propagation patterns and they are too generic, i.e., they ignore zero-shot, few-shot and synonymity problems that appear in OpenKGs. To address this, we propose TernaryCL, a CL framework based on ternary propagation patterns among head, relation and tail. TernaryCL designs Contrastive Entity and Contrastive Relation to mine ternary discriminative features with both negative entities and relations, introduces Contrastive Self to help zero- and few-shot entities learn discriminative features, Contrastive Synonym to model synonymous entities, and Contrastive Fusion to aggregate graph features from multiple paths. Extensive experiments on benchmarks demonstrate the superiority of TernaryCL over state-of-the-art models.", "track": "Information Extraction", "label": 5}, {"loc": [6.437849521636963, 1.9469300508499146], "id": 1462, "title": "Using Developer Discussions to Guide Fixing Bugs in Software", "authors": "Sheena Panthaplackel, Milos Gligoric, Junyi Jessy Li and Raymond Mooney", "abstract": "Automatically fixing software bugs is a challenging task. While recent work showed that natural language context is useful in guiding bug-fixing models, the approach required prompting developers to provide this context, which was simulated through commit messages written after the bug-fixing code changes were made. We instead propose using bug report discussions, which are available before the task is performed and are also naturally occurring, avoiding the need for any additional information from developers. For this, we augment standard bug-fixing datasets with bug report discussions. Using these newly compiled datasets, we demonstrate that various forms of natural language context derived from such discussions can aid bug-fixing, even leading to improved performance over using commit messages corresponding to the oracle bug-fixing commits.", "track": "NLP Applications", "label": 0}, {"loc": [5.939485549926758, 5.093125820159912], "id": 1467, "title": "AutoCAD: Automatically Generate Counterfactuals for Mitigating Shortcut Learning", "authors": "Jiaxin Wen, Yeshuang Zhu, Jinchao Zhang, Jie Zhou and Minlie Huang", "abstract": "Recent studies have shown the impressive efficacy of counterfactually augmented data (CAD) for reducing NLU models' reliance on spurious features and improving their generalizability. However, current methods still heavily rely on human efforts or task-specific designs to generate counterfactuals, thereby impeding CAD's applicability to a broad range of NLU tasks. In this paper, we present AutoCAD, a fully automatic and task-agnostic CAD generation framework. AutoCAD first leverages a classifier to unsupervisedly identify rationales as spans to be intervened, which disentangles spurious and causal features. Then, AutoCAD performs controllable generation enhanced by unlikelihood training to produce diverse counterfactuals. Extensive evaluations on multiple out-of-domain and challenge benchmarks demonstrate that AutoCAD consistently and significantly boosts the out-of-distribution performance of powerful pre-trained models across different NLU tasks, which is comparable or even better than previous state-of-the-art human-in-the-loop or task-specific CAD methods.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [5.596994400024414, 12.653290748596191], "id": 1471, "title": "A Multi-Modal Knowledge Graph for Classical Chinese Poetry", "authors": "Yuqing Li, Yuxin Zhang, Bin Wu, Ji-Rong Wen, Ruihua Song and Ting Bai", "abstract": "Classical Chinese poetry has a long history and is a precious cultural heritage of humankind. Displaying the classical Chinese poetry in a visual way, helps to cross cultural barriers in different countries, making it enjoyable for all the people. In this paper, we construct a multi-modal knowledge graph for classical Chinese poetry (PKG), in which the visual information of words in the poetry are incorporated. Then a multi-modal pre-training language model, PKG-Bert, is proposed to obtain the poetry representation with visual information, which bridges the semantic gap between different modalities. PKG-Bert achieves the state-of-the-art performance on the poetry-image retrieval task, showing the effectiveness of incorporating the multi-modal knowledge. The large-scale multi-modal knowledge graph of classical Chinese poetry will be released to promote the researches in classical Chinese culture area.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [10.59830379486084, 7.6844377517700195], "id": 1474, "title": "Assessing Non-autoregressive Alignment in Neural Machine Translation via Word Reordering", "authors": "Chun-Hin Tse, Ester Leung and William K. Cheung", "abstract": "Recent work on non-autoregressive neural machine translation (NAT) that leverages alignment information to explicitly reduce the modality of target distribution has reported comparable performance with counterparts that tackle multi-modality problem by implicitly modeling dependencies. Effectiveness in handling alignment is vital for models that follow this approach, where a token reordering mechanism is typically involved and plays a vital role. We review the reordering capability of the respective mechanisms in recent NAT models, and our experimental results show that their performance is sub-optimal. We propose to learn a non-autoregressive language model (NALM) based on transformer which can be combined with Viterbi decoding to achieve better reordering performance. We evaluate the proposed NALM using the PTB dataset where sentences with words permuted in different ways are expected to have their ordering recovered. Our empirical results show that the proposed method can outperform the state-of-the-art reordering mechanisms under different word permutation settings, with a 2-27 BLEU improvement, suggesting high potential for word alignment in NAT.", "track": "Machine Translation", "label": 10}, {"loc": [10.433928489685059, 7.10748291015625], "id": 1492, "title": "Syntax-guided Localized Self-attention by Constituency Syntactic Distance", "authors": "Shengyuan Hou, Jushi Kai, Haotian Xue, Bingyu Zhu, Bo Yuan, Longtao Huang, Xinbing Wang and Zhouhan Lin", "abstract": "Recent works have revealed that Transformers are implicitly learning the syntactic information in its lower layers from data, albeit is highly dependent on the quality and scale of the training data. However, learning syntactic information from data is not necessary if we can leverage an external syntactic parser, which provides better parsing quality with well-defined syntactic structures. This could potentially improve Transformer's performance and sample efficiency. In this work, we propose a syntax-guided localized self-attention for Transformer that allows directly incorporating grammar structures from an external constituency parser. It prohibits the attention mechanism to overweight the grammatically distant tokens over close ones. Experimental results show that our model could consistently improve translation performance on a variety of machine translation datasets, ranging from small to large dataset sizes, and with different source languages.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [6.471464157104492, 1.907331109046936], "id": 1495, "title": "CodeExp: Explanatory Code Document Generation", "authors": "Haotian Cui, Chenglong Wang, Junjie Huang, Jeevana Priya Inala, Todd Mytkowicz, Bo Wang, Jianfeng Gao and Nan Duan", "abstract": "Developing models that can automatically generate detailed code explanation can greatly benefit software maintenance and programming education. However, existing code-to-text generation models often produce only high-level summaries of code that do not capture implementation-level choices essential for these scenarios. To fill in this gap, we propose the code explanation generation task. We first conducted a human study to identify the criteria for high-quality explanatory docstring for code. Based on that, we collected and refined a large-scale code docstring corpus and formulated automatic evaluation metrics that best match human assessments. Finally, we present a multi-stage fine-tuning strategy and baseline models for the task. Our experiments show that (1) our refined training dataset lets models achieve better performance in the explanation generation tasks compared to larger-scale unrefined data (15x larger), and (2) fine-tuned models can generate well-structured long docstrings comparable to human-written ones. We envision our training dataset, human-evaluation protocol, recommended metrics, and fine-tuning strategy can boost future code explanation research. The code and annotated data are available at https://github.com/subercui/CodeExp.", "track": "NLP Applications", "label": 0}, {"loc": [1.3400423526763916, 4.931199073791504], "id": 1497, "title": "PAUQ: Text-to-SQL in Russian", "authors": "Daria Bakshandaeva, Oleg Dmitrievich Somov, Ekaterina Dmitrieva, Vera Davydova and Elena Tutubalina", "abstract": "Semantic parsing is an important task that allows to democratize human-computer interaction. One of the most popular text-to-SQL datasets with complex and diverse natural language (NL) questions and SQL queries is Spider. We construct and complement a Spider dataset for Russian, thus creating the first publicly available text-to-SQL dataset for this language. While examining its components - NL questions, SQL queries and databases content - we identify limitations of the existing database structure, fill out missing values for tables and add new requests for underrepresented categories. We select thirty functional test sets with different features that can be used for the evaluation of neural models' abilities. To conduct the experiments, we adapt baseline architectures RAT-SQL and BRIDGE and provide in-depth query component analysis. On the target language, both models demonstrate strong results with monolingual training and improved accuracy in multilingual scenario. In this paper, we also study trade-offs between machine-translated and manually-created NL queries. At present, Russian text-to-SQL is lacking in datasets as well as trained models, and we view this work as an important step towards filling this gap.", "track": "Resources and Evaluation", "label": 1}, {"loc": [2.553697109222412, 7.074418067932129], "id": 1501, "title": "Event-Centric Question Answering via Contrastive Learning and Invertible Event Transformation", "authors": "Junru Lu, Xingwei Tan, Gabriele Pergola, Lin Gui and Yulan He", "abstract": "Human reading comprehension often requires reasoning of event semantic relations in narratives, represented by Event-centric Question-Answering (QA). To address event-centric QA, we propose a novel QA model with contrastive learning and invertible event transformation, call TranCLR. Our proposed model utilizes an invertible transformation matrix to project semantic vectors of events into a common event embedding space, trained with contrastive learning, and thus naturally inject event semantic knowledge into mainstream QA pipelines. The transformation matrix is fine-tuned with the annotated event relation types between events that occurred in questions and those in answers, using event-aware question vectors. Experimental results on the Event Semantic Relation Reasoning (ESTER) dataset show significant improvements in both generative and extractive settings compared to the existing strong baselines, achieving over 8.4% gain in the token-level F1 score and 3.0% gain in Exact Match (EM) score under the multi-answer setting. Qualitative analysis reveals the high quality of the generated answers by TranCLR, demonstrating the feasibility of injecting event knowledge into QA model learning. Our code and models can be found at https://github.com/LuJunru/TranCLR.", "track": "Question Answering", "label": 11}, {"loc": [1.0047622919082642, 10.51447868347168], "id": 1523, "title": "Label-Driven Denoising Framework for Multi-Label Few-Shot Aspect Category Detection", "authors": "Fei Zhao, Yuchen Shen, Zhen Wu and Xinyu Dai", "abstract": "Multi-Label Few-Shot Aspect Category Detection (FS-ACD) is a new sub-task of aspect-based sentiment analysis, which aims to detect aspect categories accurately with limited training instances. Recently, dominant works use the prototypical network to accomplish this task, and employ the attention mechanism to extract keywords of aspect category from the sentences to produce the prototype for each aspect. However, they still suffer from serious noise problems: (1) due to lack of sufficient supervised data, the previous methods easily catch noisy words irrelevant to the current aspect category, which largely affects the quality of the generated prototype; (2) the semantically-close aspect categories usually generate similar prototypes, which are mutually noisy and confuse the classifier seriously. In this paper, we resort to the label information of each aspect to tackle the above problems, along with proposing a novel Label-Driven Denoising Framework (LDF). Extensive experimental results show that our framework achieves better performance than other state-of-the-art methods.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [5.471920967102051, 12.70667552947998], "id": 1537, "title": "Visual Named Entity Linking: A New Dataset and A Baseline", "authors": "Wen Xiang Sun, Yixing Fan, Jiafeng Guo, Ruqing Zhang and Xueqi Cheng", "abstract": "Visual Entity Linking (VEL) is a task to link regions of images with their corresponding entities in Knowledge Bases (KBs), which is beneficial for many computer vision tasks such as image retrieval, image caption, and visual question answering. While existing tasks in VEL either rely on textual data to complement a multi-modal linking or only link objects with general entities, which fails to perform named entity linking on large amounts of image data. In this paper, we consider a purely Visual-based Named Entity Linking (VNEL) task, where the input only consists of an image. The task is to identify objects of interest (i.e., visual entity mentions) in images and link them to corresponding named entities in KBs. Since each entity often contains rich visual and textual information in KBs, we thus propose three different sub-tasks, i.e., visual to visual entity linking (V2VEL), visual to textual entity linking (V2TEL), and visual to visual-textual entity linking (V2VTEL). In addition, we present a high-quality human-annotated visual person linking dataset, named WIKIPerson. Based on WIKIPerson, we establish a series of baseline algorithms for the solution of each sub-task, and conduct experiments to verify the quality of the proposed datasets and the effectiveness of baseline methods. We envision this work to be helpful for soliciting more works regarding VNEL in the future. The codes and datasets are publicly available at https: //github.com/ict-bigdatalab/VNEL.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.4395551681518555, 12.070502281188965], "id": 1544, "title": "MAGMA \u2013 Multimodal Augmentation of Generative Models through Adapter-based Finetuning", "authors": "Constantin Eichenberg, Sidney Black, Samuel Weinbach, Letitia Parcalabescu and Anette Frank", "abstract": "Large-scale pretraining is fast becoming the norm in Vision-Language (VL) modeling. However, prevailing VL approaches are limited by the requirement for labeled data and the use of complex multi-step pretraining objectives. We present MAGMA - a simple method for augmenting generative language models with additional modalities using adapter-based finetuning. Building on Frozen, we train a series of VL models that autoregressively generate text from arbitrary combinations of visual and textual input. The pretraining is entirely end-to-end using a single language modeling objective, simplifying optimization compared to previous approaches. Importantly, the language model weights remain unchanged during training, allowing for transfer of encyclopedic knowledge and in-context learning abilities from language pretraining. MAGMA outperforms Frozen on open-ended generative tasks, achieving state of the art results on the OKVQA benchmark and competitive results on a range of other popular VL benchmarks, while pretraining on 0.2 % of the number of samples used to train SimVLM.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.0037617683410645, 3.800354242324829], "id": 1549, "title": "Towards Tracing Knowledge in Language Models Back to the Training Data", "authors": "Ekin Akyurek, Tolga Bolukbasi, Frederick Liu, Binbin Xiong, Ian Tenney, Jacob Andreas and Kelvin Guu", "abstract": "Language models (LMs) have been shown to memorize a great deal of factual knowledge contained in their training data. But when an LM generates an assertion, it is often difficult to determine where it learned this information and whether it is true. In this paper, we propose the problem of fact tracing: identifying which training examples taught an LM to generate a particular factual assertion. Prior work on training data attribution (TDA) may offer effective tools for identifying such examples, known as \"proponents\u201d. We present the first quantitative benchmark to evaluate this. We compare two popular families of TDA methods \u2014 gradient-based and embedding-based \u2014 and find that much headroom remains. For example, both methods have lower proponent-retrieval precision than an information retrieval baseline (BM25) that does not have access to the LM at all. We identify key challenges that may be necessary for further improvement such as overcoming the problem of gradient saturation, and also show how several nuanced implementation details of existing neural TDA methods can significantly improve overall fact tracing performance.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [1.6655534505844116, 5.417201042175293], "id": 1550, "title": "ReaRev: Adaptive Reasoning for Question Answering over Knowledge Graphs", "authors": "Costas Mavromatis and George Karypis", "abstract": "Knowledge Graph Question Answering (KGQA) involves retrieving entities as answers from a Knowledge Graph (KG) using natural language queries. The challenge is to learn to reason over question-relevant KG facts that traverse KG entities and lead to the question answers. \nTo facilitate reasoning, the question is decoded into instructions, which are dense question representations used to guide the KG traversals. However, if the derived instructions do not exactly match the underlying KG information, they may lead to reasoning under irrelevant context.\nOur method, termed ReaRev, introduces a new way to KGQA reasoning with respect\nto both instruction decoding and execution. To improve instruction decoding, we perform reasoning in an adaptive manner, where KG-aware information is used to iteratively update the initial instructions. To improve instruction execution, we emulate breadth-first search (BFS) with graph neural networks (GNNs). The BFS strategy treats the instructions as a set and allows our method to decide on their execution order on the fly. Experimental results on three KGQA benchmarks demonstrate the ReaRev's effectiveness compared with previous state-of-the-art, especially when the KG is incomplete or when we tackle complex questions. Our code is publicly available at https://github.com/cmavro/ReaRev_KGQA.", "track": "Question Answering", "label": 11}, {"loc": [5.630853652954102, 12.347098350524902], "id": 1565, "title": "Understanding Social Media Cross-Modality Discourse in Linguistic Space", "authors": "Chunpu Xu, Hanzhuo Tan, Jing Li and Piji Li", "abstract": "The multimedia communications with texts and images are popular on social media. However, limited studies concern how images are structured with texts to form coherent meanings in human cognition. To fill in the gap, we present a novel concept of cross-modality discourse, reflecting how human readers couple image and text understandings. Text descriptions are first derived from images (named as subtitles) in the multimedia contexts. Five labels -- entity-level insertion, projection and concretization and scene-level restatement and extension --- are further employed to shape the structure of subtitles and texts and present their joint meanings. As a pilot study, we also build the very first dataset containing over 16K multimedia tweets with manually annotated discourse labels. The experimental results show that trendy multimedia encoders based on multi-head attention (with captions) are unable to well understand cross-modality discourse and additionally modeling texts at the output layer helps yield the-state-of-the-art results.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [7.378390789031982, 3.903184413909912], "id": 1586, "title": "TAPE: Assessing Few-shot Russian Language Understanding", "authors": "Ekaterina Taktasheva, Alena Fenogenova, Denis Shevelev, Nadezhda Katricheva, Maria Tikhonova, Albina Ramilevna Akhmetgareeva, Oleg Vadimovich Zinkevich, Anastasiia Y. Bashmakova, Svetlana V. Iordanskaia, Valentina Kurenshchikova, Alena Spiridonova, Ekaterina Artemova, Tatiana Shavrina and Vladislav Mikhailov", "abstract": "Recent advances in zero-shot and few-shot learning have shown promise for a scope of research and practical purposes. However, this fast-growing area lacks standardized evaluation suites for non-English languages, hindering progress outside the Anglo-centric paradigm. To address this line of research, we propose TAPE (Text Attack and Perturbation Evaluation), a novel benchmark that includes six more complex NLU tasks for Russian, covering multi-hop reasoning, ethical concepts, logic and commonsense knowledge. The TAPE's design focuses on systematic zero-shot and few-shot NLU evaluation: (i) linguistic-oriented adversarial attacks and perturbations for analyzing robustness, and (ii) subpopulations for nuanced interpretation. The detailed analysis of testing the autoregressive baselines indicates that simple spelling-based perturbations affect the performance the most, while paraphrasing the input has a more negligible effect. At the same time, the results demonstrate a significant gap between the neural and human baselines for most tasks. We publicly release TAPE (https://tape-benchmark.com) to foster research on robust LMs that can generalize to new tasks when little to no supervision is available.", "track": "Resources and Evaluation", "label": 1}, {"loc": [0.5753600001335144, 7.239218235015869], "id": 1598, "title": "A Hierarchical N-Gram Framework for Zero-Shot Link Prediction", "authors": "Mingchen Li, Junfan Chen, Samuel Mensah, Nikolaos Aletras, Xiulong Yang and yang ye", "abstract": "Knowledge graphs typically contain a large number of entities but often cover only a fraction of all relations between them (i.e., incompleteness). Zero-shot link prediction (ZSLP) is a popular way to tackle the problem by automatically identifying unobserved relations between entities. Most recent approaches use textual features of relations (e.g., surface name or textual descriptions) as auxiliary information to improve the encoded representation. These methods lack robustness as they are bound to support only tokens from a fixed vocabulary and unable to model out-of-vocabulary (OOV) words. Subword units such as character n-grams have the capability of generating more expressive representations for OOV words. Hence, in this paper, we propose a {\\bf H}ierarchical {\\bf N}-gram framework for {\\bf Z}ero-{\\bf S}hot {\\bf L}ink {\\bf P}rediction (HNZSLP) that leverages character n-gram information for ZSLP. Our approach works by first constructing a hierarchical n-gram graph from the surface name of relations. Subsequently, a new Transformer-based network models the hierarchical n-gram graph to learn a relation embedding for ZSLP. Experimental results show that our proposed HNZSLP method achieves state-of-the-art performance on two standard ZSLP datasets.", "track": "NLP Applications", "label": 0}, {"loc": [8.72663402557373, 8.185297966003418], "id": 1600, "title": "Quadapter: Adapter for GPT-2 Quantization", "authors": "Minseop Park, Jaeseong You, Markus Nagel and Simyung Chang", "abstract": "Transformer language models such as GPT-2 are difficult to quantize because of outliers in the activations leading to a large quantization error. To adapt to the error, one must use quantization-aware training, which entails a fine-tuning process based on the dataset and the training pipeline identical to those for the original model. Pretrained language models, however, often do not grant access to their datasets and training pipelines, forcing us to rely on arbitrary ones for fine-tuning. In that case, it is observed that quantization-aware training overfits the model to the fine-tuning data. To this end introduced is a quantization adapter (Quadapter), a small set of parameters that are learned to make activations quantization-friendly by scaling them channel-wise.\nFor quantization without overfitting, we introduce a quantization adapter (Quadapter), a small set of parameters that are learned to make activations quantization-friendly by scaling them channel-wise. It keeps the model parameters unchanged. By applying our method to the challenging task of quantizing GPT-2, we demonstrate that it effectively prevents the overfitting and improves the quantization performance.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [2.838000774383545, 4.6923441886901855], "id": 1613, "title": "BanglaRQA: A Benchmark Dataset for Under-resourced Bangla Language Reading Comprehension-based Question Answering with Diverse Question-Answer Types", "authors": "Syed Mohammed Sartaj Ekram, Adham Arik Rahman, Md. Sajid Altaf, Mohammed Saidul Islam, Mehrab Mustafy Rahman, Md. Mezbaur Rahman, Md Azam Hossain and Abu Raihan Mostofa Kamal", "abstract": "High-resource languages, such as English, have access to a plethora of datasets with various question-answer types resembling real-world reading comprehension. However, there is a severe lack of diverse and comprehensive question-answering datasets in under-resourced languages like Bangla. The ones available are either translated versions of English datasets with a niche answer format or created by human annotations focusing on a specific domain, question type, or answer type. To address these limitations, this paper introduces BanglaRQA, a reading comprehension-based Bangla question-answering dataset with various question-answer types. BanglaRQA consists of 3,000 context passages and 14,889 question-answer pairs created from those passages. The dataset comprises answerable and unanswerable questions covering four unique categories of questions and three types of answers. In addition, this paper also implemented four different Transformer models for question-answering on the proposed dataset. The best-performing model achieved an overall 62.42% EM and 78.11% F1 score. However, detailed analyses showed that the performance varies across question-answer types, leaving room for substantial improvement of the model performance. Furthermore, we demonstrated the effectiveness of BanglaRQA as a training resource by showing strong results on the bn_squad dataset. Therefore, BanglaRQA has the potential to contribute to the advancement of future research by enhancing the capability of language models. The dataset and codes are available at https://github.com/sartajekram419/BanglaRQA", "track": "Question Answering", "label": 11}, {"loc": [4.215872287750244, 3.9530646800994873], "id": 1614, "title": "Chaining Simultaneous Thoughts for Numerical Reasoning", "authors": "Zhihong Shao, Fei Huang and Minlie Huang", "abstract": "Given that rich information is hidden behind ubiquitous numbers in text, numerical reasoning over text should be an essential skill of AI systems. To derive precise equations to solve numerical reasoning problems, previous work focused on modeling the structures of equations, and has proposed various structured decoders. Though structure modeling proves to be effective, these structured decoders construct a single equation in a pre-defined autoregressive order, potentially placing an unnecessary restriction on how a model should grasp the reasoning process. Intuitively, humans may have numerous pieces of thoughts popping up in no pre-defined order; thoughts are not limited to the problem at hand, and can even be concerned with other related problems. By comparing diverse thoughts and chaining relevant pieces, humans are less prone to errors. In this paper, we take this inspiration and propose CANTOR, a numerical reasoner that models reasoning steps using a directed acyclic graph where we produce diverse reasoning steps simultaneously without pre-defined decoding dependencies, and compare and chain relevant ones to reach a solution. Extensive experiments demonstrated the effectiveness of CANTOR under both fully-supervised and weakly-supervised settings.", "track": "Question Answering", "label": 11}, {"loc": [4.269927501678467, 4.329064846038818], "id": 1619, "title": "Inferring Implicit Relations in Complex Questions with Language Models", "authors": "Uri Katz, Mor Geva and Jonathan Berant", "abstract": "A prominent challenge for modern language understanding systems is the ability to answer implicit reasoning questions, where the required reasoning steps for answering the question are not mentioned in the text explicitly. In this work, we investigate why current models struggle with implicit reasoning question answering (QA) tasks, by decoupling inference of reasoning steps from their execution.\nWe define a new task of implicit relation inference and construct a benchmark, IMPLICITRELATIONS, where given a question, a model should output a list of concept-relation pairs, where the relations describe the implicit reasoning steps required for answering the question.\nUsing IMPLICITRELATIONS, we evaluate models from the GPT-3 family and find that, while these models struggle on the implicit reasoning QA task, they often succeed at inferring implicit relations.\nThis suggests that the challenge in implicit reasoning questions does not stem from the need to plan a reasoning strategy alone, but to do it while also retrieving and reasoning over relevant information.", "track": "Question Answering", "label": 11}, {"loc": [7.988410472869873, 8.671518325805664], "id": 1649, "title": "Eliciting and Understanding Cross-task Skills with Task-level Mixture-of-Experts", "authors": "Qinyuan Ye, Juan Zha and Xiang Ren", "abstract": "Recent works suggest that transformer models are capable of multi-tasking on diverse NLP tasks and adapt to new tasks efficiently. However, the potential of these multi-task models may be limited as they use the same set of parameters for all tasks. In contrast, humans tackle tasks in a more flexible way, by making proper presumptions on what skills and knowledge are relevant and executing only the necessary computations. Inspired by this, we propose to use task-level mixture-of-expert models, which has a collection of transformer layers (i.e., experts) and a router component to choose among these experts dynamically and flexibly. We find that these models help improve the average performance gain (ARG) metric by 2.6% when adapting to unseen tasks in few-shot settings, and by 5.6% in zero-shot generalization settings. Further, we show that the learned routing decisions and experts partly rediscover human categorization of NLP tasks -- certain experts are strongly associated with extractive tasks, some with classification tasks, and some with tasks requiring world knowledge.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [7.225702285766602, 5.80625581741333], "id": 1650, "title": "On the Curious Case of l2 norm of Sense Embeddings", "authors": "Yi Zhou and Danushka Bollegala", "abstract": "We show that the l2 norm of a static sense embedding encodes information related to the frequency of that sense in the training corpus used to learn the sense embeddings. This finding can be seen as an extension of a previously known relationship for word embeddings to sense embeddings. Our experimental results show that in spite of its simplicity, the l2 norm of sense embeddings is a surprisingly effective feature for several word sense related tasks such as (a) most frequent sense prediction, (b) word-in-context (WiC), and (c) word sense disambiguation (WSD). In particular, by simply including the l2 norm of a sense embedding as a feature in a classifier, we show that we can improve WiC and WSD methods that use static sense embeddings.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [8.729771614074707, 8.125304222106934], "id": 1653, "title": "Partially-Random Initialization: A Smoking Gun for Binarization Hypothesis of BERT", "authors": "Arash Ardakani", "abstract": "In the past few years, pre-trained BERT has become one of the most popular deep-learning language models due to their remarkable performance in natural language processing (NLP) tasks. However, the superior performance of BERT comes at the cost of high computational and memory complexity, hindering its envisioned widespread deployment in edge devices with limited computing resources. Binarization can alleviate these limitations by reducing storage requirements and improving computing performance. However, obtaining a comparable accuracy performance for binary BERT w.r.t. its full-precision counterpart is still a difficult task. We observe that direct binarization of pre-trained BERT provides a poor initialization during the fine-tuning phase, making the model incapable of achieving a decent accuracy on downstream tasks. Based on this observation, we put forward the following \\textit{hypothesis}: partially randomly-initialized BERT with binary weights and activations can reach to a decent accuracy performance by distilling knowledge from the its full-precision counterpart. We show that BERT with pre-trained embedding layer and randomly-initialized encoder is a smoking gun for this hypothesis. We identify the smoking gun through a series of experiments and show that it yields a new set of state-of-the-art results on the GLUE and SQuAD benchmarks.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [8.028206825256348, 9.455389976501465], "id": 1657, "title": "Prompt Consistency for Zero-Shot Task Generalization", "authors": "Chunting Zhou, Junxian He, Xuezhe Ma, Taylor Berg-Kirkpatrick and Graham Neubig", "abstract": "One of the most impressive results of recent NLP history is the ability of pre-trained language models to solve new tasks in a zero-shot setting. To achieve this, NLP tasks are framed as natural language prompts, generating a response indicating the predicted output. Nonetheless, the performance in such settings often lags far behind its supervised counterpart, suggesting a large space for potential improvement. In this paper, we explore methods to utilize unlabeled data to improve zero-shot performance. Specifically, we take advantage of the fact that multiple prompts can be used to specify a single task, and propose to regularize prompt consistency, encouraging consistent predictions over this diverse set of prompts. Our method makes it possible to fine-tune the model either with extra unlabeled training data, or directly on test input at inference time in an unsupervised manner. In experiments, our approach outperforms the state-of-the-art zero-shot learner, T0, on 9 out of 11 datasets across 4 NLP tasks by up to 10.6 absolute points in terms of accuracy. The gains are often attained with a small number of unlabeled examples.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.579060077667236, 7.721115589141846], "id": 1659, "title": "In-Context Learning for Few-Shot Dialogue State Tracking", "authors": "Yushi Hu, Chia-Hsuan Lee, Tianbao Xie, Tao Yu, Noah A. Smith and Mari Ostendorf", "abstract": "Collecting and annotating task-oriented dialogues is time-consuming and costly. Thus, zero and few shot learning for dialogue tasks presents an exciting opportunity. In this work, we propose an in-context (IC) learning framework for zero-shot and few-shot learning dialogue state tracking (DST), where a large pretrained language model (LM) takes a test instance and a few exemplars as input, and directly decodes the dialogue state without any parameter updates. This approach is more flexible and scalable than prior DST work when adapting to new domains and scenarios. To better leverage a tabular domain description in the LM prompt, we reformulate DST into a text-to-SQL problem. We also propose a novel approach to retrieve annotated dialogues as exemplars. Empirical results on MultiWOZ show that our method IC-DST substantially outperforms previous fine-tuned state-of-the-art models in few-shot settings. In addition, we test IC-DST in zero-shot settings, in which the model only takes a fixed task instruction as input, finding that it outperforms previous zero-shot methods by a large margin.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [5.280877590179443, 12.220988273620605], "id": 1665, "title": "On Advances in Text Generation from Images Beyond Captioning: A Case Study in Self-Rationalization", "authors": "Shruti Palaskar, Akshita Bhagia, Yonatan Bisk, Florian Metze, Alan W Black and Ana Marasovic", "abstract": "Combining the visual modality with pretrained language models has been surprisingly effective for simple descriptive tasks such as image captioning. More general text generation however remains elusive. We take a step back and ask: How do these models work for more complex generative tasks, i.e. conditioning on both text and images? Are multimodal models simply visually adapted language models, or do they combine they reason jointly over modalities?\n\nWe investigate these questions in the context of self-rationalization (jointly generating task labels/answers and free-text explanations) of three tasks: (i) visual question answering in VQA-X, (ii) visual commonsense reasoning in VCR, and (iii) visual-textual entailment in E-SNLI-VE. We show that recent unimodal advances, CLIP image representations and scaling of language models, do not consistently improve\nself-rationalization in multimodal tasks. We find that no single model type works universally best across tasks, datasets, and finetuning data sizes. Our \ufb01ndings motivate the need for novel general backbones that move text generation from images and text beyond image captioning.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [8.87336254119873, 7.232455730438232], "id": 1669, "title": "The challenges of temporal alignment on Twitter during crises", "authors": "Aniket Pramanick, Tilman Beck, Kevin Stowe and Iryna Gurevych", "abstract": "Language use changes over time, and this impacts the effectiveness of NLP systems. This phenomenon is even more prevalent in social media data during crisis events where meaning and frequency of word usage may change over the course of days. Contextual language models fail to adapt temporally, emphasizing the need for temporal adaptation in models which need to be deployed over an extended period of time. While existing approaches consider data spanning large periods of time (from years to decades), shorter time spans are critical for crisis data. We quantify temporal degradation for this scenario and propose methods to cope with performance loss by leveraging techniques from domain adaptation. To the best of our knowledge, this is the first effort to explore effects of rapid language change driven by adversarial adaptations, particularly during natural and human-induced disasters. Through extensive experimentation on diverse crisis datasets, we analyze under what conditions our approaches outperform strong baselines while highlighting the current limitations of temporal adaptation methods in scenarios where access to unlabeled data is scarce.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [8.210074424743652, 7.2390618324279785], "id": 1670, "title": "Experimental Standards for Deep Learning in Natural Language Processing Research", "authors": "Dennis Ulmer, Elisa Bassignana, Max M\u00fcller-Eberstein, Daniel Varab, Mike Zhang, Rob van der Goot, Christian Hardmeier and Barbara Plank", "abstract": "The field of Deep Learning (DL) has undergone explosive growth during the last decade, with a substantial impact on Natural Language Processing (NLP) as well. Yet, compared to more established disciplines, a lack of common experimental standards remains an open challenge to the field at large. Starting from fundamental scientific principles, we distill ongoing discussions on experimental standards in NLP into a single, widely-applicable methodology. Following these best practices is crucial to strengthen experimental evidence, improve reproducibility and enable scientific progress. These standards are further collected in a public repository to help them transparently adapt to future needs.", "track": "Theme Track", "label": 18}, {"loc": [2.397756814956665, 8.213961601257324], "id": 1671, "title": "Few-Shot Anaphora Resolution in Scientific Protocols via Mixtures of In-Context Experts", "authors": "Nghia T. Le, Fan Bai and Alan Ritter", "abstract": "Anaphora resolution is an important task for information extraction across a range of languages, text genres, and domains, motivating the need for methods that do not require large annotated datasets. In-context learning has emerged as a promising approach, yet there are a number of challenges in applying in-context learning to resolve anaphora. For example, encoding a single in-context demonstration that consists of: an anaphor, a paragraph-length context, and a list of corresponding antecedents, requires conditioning a language model on a long sequence of tokens, limiting the number of demonstrations per prompt.\nIn this paper, we present Mice (Mixtures of In-Context Experts), which we demonstrate is effective for few-shot anaphora resolution in scientific protocols. Given only a handful of training examples, Mice combines the predictions of hundreds of in-context experts, yielding a 30% increase in F1 score over a competitive prompt retrieval baseline. Furthermore, we show Mice can be used to train compact student models without sacrificing performance. As far as we are aware, this is the first work to present experimental results demonstrating the effectiveness of in-context learning on the task of few-shot anaphora resolution in scientific protocols.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [8.435981750488281, 7.397261619567871], "id": 1673, "title": "Exploring Predictive Uncertainty and Calibration in NLP: A Study on the Impact of Method & Data Scarcity", "authors": "Dennis Ulmer, Jes Frellsen and Christian Hardmeier", "abstract": "We investigate the problem of determining the predictive confidence (or, conversely, uncertainty) of a neural classifier through the lens of low-resource languages. By training models on sub-sampled datasets in three different languages, we assess the quality of estimates from a wide array of approaches and their dependence on the amount of available data. We find that while approaches based on pre-trained models and ensembles achieve the best results overall, the quality of uncertainty estimates can surprisingly suffer with more data. We also perform a qualitative analysis of uncertainties on sequences, discovering that a model's total uncertainty seems to be influenced to a large degree by its data uncertainty, not model uncertainty. All model implementations are open-sourced in a software package.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [6.3040852546691895, 5.44805383682251], "id": 1705, "title": "Conditional Supervised Contrastive Learning for Fair Text Classification", "authors": "Jianfeng Chi, William Shand, Yaodong Yu, Kai-Wei Chang, Han Zhao and Yuan Tian", "abstract": "Contrastive representation learning has gained much attention due to its superior performance in learning representations from both image and sequential data. However, the learned representations could potentially lead to performance disparities in downstream tasks, such as increased silencing of underrepresented groups in toxicity comment classification. In light of this challenge, in this work, we study learning fair representations that satisfy a notion of fairness known as equalized odds for text classification via contrastive learning. Specifically, we first theoretically analyze the connections between learning representations with a fairness constraint and conditional supervised contrastive objectives, and then propose to use conditional supervised contrastive objectives to learn fair representations for text classification. We conduct experiments on two text datasets to demonstrate the effectiveness of our approaches in balancing the trade-offs between task performance and bias mitigation among existing baselines for text classification. Furthermore, we also show that the proposed methods are stable in different hyperparameter settings.", "track": "Ethics", "label": 21}, {"loc": [1.5787986516952515, 8.570415496826172], "id": 1717, "title": "SpaBERT: A Pretrained Language Model from Geographic Data for Geo-Entity Representation", "authors": "Zekun Li, Jina Kim, Yao-Yi Chiang and Muhao Chen", "abstract": "Named geographic entities (geo-entities for short) are the building blocks of many geographic datasets. Characterizing geo-entities is integral to various application domains, such as geo-intelligence and map comprehension, while a key challenge is to capture the spatial-varying context of an entity. We hypothesize that we shall know the characteristics of a geo-entity by its surrounding entities, similar to knowing word meanings by their linguistic context. Accordingly, we propose a novel spatial language model, SpaBERT, which provides a general-purpose geo-entity representation based on neighboring entities in geospatial data. SpaBERT extends BERT to capture linearized spatial context, while incorporating a spatial coordinate embedding mechanism to preserve spatial relations of entities in the 2-dimensional space. SpaBERT is pretrained with masked language modeling and masked entity prediction tasks to learn spatial dependencies. We apply SpaBERT to two downstream tasks: geo-entity typing and geo-entity linking. Compared with the existing language models that do not use spatial context, SpaBERT shows significant performance improvement on both tasks. We also analyze the entity representation from SpaBERT in various settings and the effect of spatial coordinate embedding.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [4.225561618804932, 7.505403518676758], "id": 1732, "title": "Self-training with Two-phase Self-augmentation for Few-shot Dialogue Generation", "authors": "Wanyu Du, Hanjie Chen and Yangfeng Ji", "abstract": "In task-oriented dialogue systems, response generation from meaning representations (MRs) often suffers from limited training examples, due to the high cost of annotating MR-to-Text pairs. Previous works on self-training leverage fine-tuned conversational models to automatically generate pseudo-labeled MR-to-Text pairs for further fine-tuning. However, some self-augmented data may be noisy or uninformative for the model to learn from. In this work, we propose a two-phase self-augmentation procedure to generate high-quality pseudo-labeled MR-to-Text pairs: the first phase selects the most informative MRs based on model's prediction uncertainty; with the selected MRs, the second phase generates accurate responses by aggregating multiple perturbed latent representations from each MR. Empirical experiments on two benchmark datasets, FewShotWOZ and FewShotSGD, show that our method generally outperforms existing self-training methods on both automatic and human evaluations.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [8.286637306213379, 7.1084136962890625], "id": 1741, "title": "Is NLP Ready for Standardization?", "authors": "Lauriane Aufrant", "abstract": "While standardization is a well-established activity in other scientific fields such as telecommunications, networks or multimedia, in the field of AI and more specifically NLP it is still at its dawn. In this paper, we explore how various aspects of NLP (evaluation, data, tasks...) lack standards and how that can impact science, but also the society, the industry, and regulations. We argue that the numerous initiatives to rationalize the field and establish good practices are only the first step, and developing formal standards remains needed to bring further clarity to NLP research and industry, at a time where this community faces various crises regarding ethics or reproducibility. We thus encourage NLP researchers to contribute to existing and upcoming standardization projects, so that they can express their needs and concerns, while sharing their expertise.", "track": "Theme Track", "label": 18}, {"loc": [6.862176418304443, 6.41918420791626], "id": 1744, "title": "Probing for Incremental Parse States in Autoregressive Language Models", "authors": "Tiwalayo Eisape, Vineet Gangireddy, Roger Levy and Yoon Kim", "abstract": "Next-word predictions from autoregressive neural language models show remarkable sensitivity to syntax. This work evaluates the extent to which this behavior arises as a result of a learned ability to maintain implicit representations of incremental syntactic structures. We extend work in syntactic probing to the incremental setting and present several probes for extracting incomplete syntactic structure (operationalized through parse states from a stack-based parser) from autoregressive language models. We find that our probes can be used to predict model preferences on ambiguous sentence prefixes and causally intervene on model representations and steer model behavior. This suggests implicit incremental syntactic inferences underlie next-word predictions in autoregressive neural language models.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [3.5482852458953857, 4.426609516143799], "id": 1761, "title": "Re-Examining Calibration: The Case of Question Answering", "authors": "Chenglei Si, Chen Zhao, Sewon Min and Jordan Boyd-Graber", "abstract": "For users to trust model predictions, they need to understand model outputs, particularly their confidence \u2014 calibration aims to adjust (calibrate) models' confidence to match expected accuracy. We argue that the traditional calibration evaluation does not promote effective calibrations: for example, it can encourage always assigning a mediocre confidence score to all predictions, which does not help users distinguish correct predictions from wrong ones. Building on those observations, we propose a new calibration metric, MacroCE, that better captures whether the model assigns low confidence to wrong predictions and high confidence to correct predictions. Focusing on the practical application of open-domain question answering, we examine conventional calibration methods applied on the widely-used retriever-reader pipeline, all of which do not bring significant gains under our new MacroCE metric. Toward better calibration, we propose a new calibration method (ConsCal) that uses not just final model predictions but whether multiple model checkpoints make consistent predictions. Altogether, we provide an alternative view of calibration along with a new metric, re-evaluation of existing calibration methods on our metric, and proposal of a more effective calibration method.", "track": "Question Answering", "label": 11}, {"loc": [1.714347243309021, 3.802700996398926], "id": 1764, "title": "Accelerating Learned Sparse Indexes Via Term Impact Decomposition", "authors": "Joel Mackenzie, Antonio Mallia, Alistair Moffat and Matthias Petri", "abstract": "Novel inverted index-based learned sparse ranking models provide more effective, but less efficient, retrieval performance compared to traditional ranking models like BM25. In this paper, we introduce a technique we call postings clipping to improve the query efficiency of learned representations. Our technique amplifies the benefit of dynamic pruning query processing techniques by accounting for changes in term importance distributions of learned ranking models. The new clipping mechanism accelerates top-k retrieval by up to 9.6X without any loss in effectiveness.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [7.631075859069824, 8.593328475952148], "id": 1767, "title": "Do Text-to-Text Multi-Task Learners Suffer from Task Conflict?", "authors": "David Mueller, Nicholas Andrews and Mark Dredze", "abstract": "Traditional multi-task learning architectures learn a single model across multiple tasks through a shared encoder followed by task-specific decoders. Learning these models often requires specialized training algorithms that address task-conflict in the shared parameter updates, which otherwise can lead to negative transfer. A new type of multi-task learning within NLP homogenizes multi-task architectures as a shared encoder and language model decoder, which does surprisingly well across a range of diverse tasks. Does this new architecture suffer from task-conflicts that require specialized training algorithms? We study how certain factors in the shift towards text-to-text models affects multi-task conflict and negative transfer, finding that both directional conflict and transfer are surprisingly constant across architectures.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [8.40966796875, 7.427639484405518], "id": 1774, "title": "MANTa: Efficient Gradient-Based Tokenization for End-to-End Robust Language Modeling", "authors": "Nathan Godey, Roman Castagn\u00e9, \u00c9ric de la Clergerie and Beno\u00eet Sagot", "abstract": "Static subword tokenization algorithms have been an essential component of recent works on language modeling. However, their static nature results in important flaws that degrade the models' downstream performance and robustness. In this work, we propose MANTa, a Module for Adaptive Neural TokenizAtion. MANTa is a differentiable tokenizer trained end-to-end with the language model. The resulting system offers a trade-off between the expressiveness of byte-level models and the speed of models trained using subword tokenization. In addition, our tokenizer is highly explainable since it produces an explicit segmentation of sequences into blocks. We evaluate our pre-trained model on several English datasets from different domains as well as on synthetic noise. We find that MANTa improves robustness to character perturbations and out-of-domain data. We then show that MANTa performs comparably to other models on the general-domain GLUE benchmark. Finally, we show that it is considerably faster than strictly byte-level models.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [4.00194787979126, 4.52043342590332], "id": 1813, "title": "Calibrating Trust of Multi-Hop Question Answering Systems with Decompositional Probes", "authors": "Kaige Xie, Sarah Wiegreffe and Mark Riedl", "abstract": "Multi-hop Question Answering (QA) is a challenging task since it requires an accurate aggregation of information from multiple context paragraphs and a thorough understanding of the underlying reasoning chains. Recent work in multi-hop QA has shown that performance can be boosted by first decomposing the questions into simpler, single-hop questions. In this paper, we explore one additional utility of the multi-hop decomposition from the perspective of explainable NLP: to create explanation by probing a neural QA model with them. We hypothesize that in doing so, users will be better able to predict when the underlying QA system will give the correct answer. Through human participant studies, we verify that exposing the decomposition probes and answers to the probes to users can increase their ability to predict system performance on a question instance basis. We show that decomposition is an effective form of probing QA systems as well as a promising approach to explanation generation. In-depth analyses show the need for improvements in decomposition systems.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [7.68852424621582, 3.5797853469848633], "id": 1837, "title": "CheckHARD: Checking Hard Labels for Adversarial Text Detection, Prediction Correction, and Perturbed Word Suggestion", "authors": "Hoang-Quoc Nguyen-Son, Huy Quang Ung, Seira Hidano, Kazuhide Fukushima and Shinsaku Kiyomoto", "abstract": "An adversarial attack generates harmful text that fools a target model. More dangerously, this text is unrecognizable by humans. Existing work detects adversarial text and corrects a target's prediction by identifying perturbed words and changing them into their synonyms, but many benign words are also changed. In this paper, we directly detect adversarial text, correct the prediction, and suggest perturbed words by checking the change in the hard labels from the target's predictions after replacing a word with its transformation using a model that we call CheckHARD. The experiments demonstrate that CheckHARD outperforms existing work on various attacks, models, and datasets.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [5.902439117431641, 4.853776931762695], "id": 1842, "title": "Mitigating Covertly Unsafe Text within Natural Language Systems", "authors": "Alex Mei, Anisha Kabir, Sharon Levy, Melanie Subbiah, Emily Allaway, John N. Judge, Desmond Patton, Bruce Bimber, Kathleen McKeown and William Yang Wang", "abstract": "An increasingly prevalent problem for intelligent technologies is text safety, as uncontrolled systems may generate recommendations to their users that lead to injury or life-threatening consequences. However, the degree of explicitness of a generated statement that can cause physical harm varies. In this paper, we distinguish types of text that can lead to physical harm and establish one particularly underexplored category: covertly unsafe text. Then, we further break down this category with respect to the system's information and discuss solutions to mitigate the generation of text in each of these subcategories. Ultimately, our work defines the problem of covertly unsafe language that causes physical harm and argues that this subtle yet dangerous issue needs to be prioritized by stakeholders and regulators. We highlight mitigation strategies to inspire future researchers to tackle this challenging problem and help improve safety within smart systems.", "track": "Theme Track", "label": 18}, {"loc": [4.765920639038086, 8.699359893798828], "id": 1847, "title": "\"I Know Who You Are\u201d: Character-Based Features for Conversational Humor Recognition in Chinese", "authors": "Wenbo Shang, Jiangjiang Zhao, Zezhong WANG, Binyang Li, Fangchun Yang and Kam-Fai Wong", "abstract": "Humor plays an important role in our daily life, as it is an essential and fascinating element in the communication between persons. Therefore, how to recognize punchlines from the dialogue, i.e. conversational humor recognition, has attracted much interest of computational linguistics communities. However, most existing work attempted to understand the conversational humor by analyzing the contextual information of the dialogue, but neglected the character of the interlocutor, such as age, gender, occupation, and so on. For instance, the same utterance could bring out humorous from a serious person, but may be a plain expression from a naive person. To this end, this paper proposes a Character Fusion Conversational Humor Recognition model (CFCHR) to explore character information to recognize conversational humor. CFCHR utilizes a multi-task learning framework that unifies two highly pertinent tasks, i.e., character extraction and punchline identification. Based on deep neural networks, we trained both tasks jointly by sharing weight to extract the common and task-invariant features while each task could still learn its task-specific features. Experiments were conducted on Chinese sitcoms corpus, which consisted of 12,677 utterances from 22 characters. The experimental results demonstrated that CFCHR could achieve 33.08% improvements in terms of F1-score over some strong baselines, and proved the effectiveness of the character information to identify the punchlines.", "track": "NLP Applications", "label": 0}, {"loc": [6.306203842163086, 5.225409507751465], "id": 1854, "title": "DebiasGAN: Eliminating Position Bias in News Recommendation with Adversarial Learning", "authors": "Chuhan Wu, Fangzhao Wu, Xiangnan He and Yongfeng Huang", "abstract": "Click behaviors are widely used for learning news recommendation models, but they are heavily affected by the biases brought by the news display positions. It is important to remove position biases to train unbiased recommendation model and capture unbiased user interest. In this paper, we propose a news recommendation method named DebiasGAN that can effectively alleviate position biases via adversarial learning. The core idea is modeling the personalized effect of position bias on click behaviors in a candidate-aware way, and learning debiased candidate-aware user embeddings from which the position information cannot be discriminated. More specifically, we use a bias-aware click model to capture the effect of position bias on click behaviors, and use a bias-invariant click model with random candidate positions to estimate the ideally unbiased click scores. We apply adversarial learning to the embeddings learned by the two models to help the bias-invariant click model capture debiased user interest. Experimental results on two real-world datasets show that DebiasGAN effectively improves news recommendation by eliminating position biases.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [3.7355446815490723, 9.935484886169434], "id": 1869, "title": "Generating Multiple-Length Summaries via Reinforcement Learning for Unsupervised Sentence Summarization", "authors": "Dongmin Hyun, Xiting Wang, Chayoung Park, Xing Xie and Hwanjo Yu", "abstract": "Sentence summarization shortens given texts while maintaining core contents of the texts. Unsupervised approaches have been studied to summarize texts without ground-truth summaries. However, recent unsupervised models are extractive, which remove words from texts and thus they are less flexible than abstractive summarization. In this work, we devise an abstractive model based on reinforcement learning without ground-truth summaries. We formulate the unsupervised summarization based on the Markov decision process with rewards representing the summary quality. To further enhance the summary quality, we develop a multi-summary learning mechanism that generates multiple summaries with varying lengths for a given text, while making the summaries mutually enhance each other. Experimental results show that the proposed model substantially outperforms both abstractive and extractive models, yet frequently generating new words not contained in input texts.", "track": "Summarization", "label": 14}, {"loc": [9.379859924316406, 6.294040679931641], "id": 1873, "title": "Multilingual Sentence Transformer as A Multilingual Word Aligner", "authors": "Weikang Wang, Guanhua Chen, Hanqing Wang, Yue Han and Yun Chen", "abstract": "Multilingual pretrained language models (mPLMs) have shown their effectiveness in multilingual word alignment induction. However, these methods usually start from mBERT or XLM-R. In this paper, we investigate whether multilingual sentence Transformer LaBSE is a strong multilingual word aligner. This idea is non-trivial as LaBSE is trained to learn language-agnostic sentence-level embeddings, while the alignment extraction task requires the more fine-grained word-level embeddings to be language-agnostic. We demonstrate that the vanilla LaBSE outperforms other mPLMs currently used in the alignment task, and then propose to finetune LaBSE on parallel corpus for further improvement. Experiment results on seven language pairs show that our best aligner outperforms previous state-of-the-art models of all varieties. In addition, our aligner supports different language pairs in a single model, and even achieves new state-of-the-art on zero-shot language pairs that does not appear in the finetuning process.", "track": "Multilinguality", "label": 13}, {"loc": [5.950556755065918, 5.130181312561035], "id": 1879, "title": "CORE: A Retrieve-then-Edit Framework for Counterfactual Data Generation", "authors": "Tanay Dixit, Bhargavi Paranjape, Hannaneh Hajishirzi and Luke Zettlemoyer", "abstract": "Counterfactual data augmentation (CDA) -- i.e., adding minimally perturbed inputs during training -- helps reduce model reliance on spurious correlations and improves generalization to out-of-distribution (OOD) data. Prior work on generating counterfactuals only considered restricted classes of perturbations, limiting their effectiveness. We present Counterfactual Generation via Retrieval and Editing (CORE), a retrieval-augmented generation framework for creating diverse counterfactual perturbations for CDA. For each training example, CORE first performs a dense retrieval over a task-related unlabeled text corpus using a learned bi-encoder and extracts relevant counterfactual excerpts. CORE then incorporates these into prompts to a large language model with few-shot learning capabilities, for counterfactual editing. Conditioning language model edits on naturally occurring data results in more diverse perturbations. Experiments on natural language inference and sentiment analysis benchmarks show that CORE counterfactuals are more effective at improving generalization to OOD data compared to other DA approaches. We also show that the CORE retrieval framework can be used to encourage diversity in manually authored perturbations.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [4.182408332824707, 7.435129165649414], "id": 1885, "title": "Conversation Disentanglement with Bi-Level Contrastive Learning", "authors": "Chengyu Huang, Zheng Zhang, Hao Fei and Lizi Liao", "abstract": "Conversation disentanglement aims to group utterances into detached sessions, which is a fundamental task in processing multi-party conversations. Existing methods have two main drawbacks. First, they overemphasize pairwise utterance relations but pay inadequate attention to the utterance-to-context relation modeling. Second, huge amount of human annotated data is required for training, which is expensive to obtain in practice. To address these issues, we propose a general disentangle model based on bi-level contrastive learning. It brings closer utterances in the same session while encourages each utterance to be near its clustered session prototypes in the representation space. Unlike existing approaches, our disentangle model works in both supervised setting with labeled data and unsupervised setting when no such data is available. The proposed method achieves new state-of-the-art performance on both settings across several public datasets.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [1.9653910398483276, 4.0140252113342285], "id": 1887, "title": "You can't pick your neighbors, or can you? When and How to Rely on Retrieval in the kNN-LM", "authors": "Andrew Drozdov, Shufan Wang, Razieh Rahimi, Andrew McCallum, Hamed Zamani and Mohit Iyyer", "abstract": "Retrieval-enhanced language models (LMs), which condition their predictions on text retrieved from large external datastores, have recently shown significant perplexity improvements compared to standard LMs. One such approach, the kNN-LM, interpolates any existing LM's predictions with the output of a k-nearest neighbors model and requires no additional training. In this paper, we explore the importance of lexical and semantic matching in the context of items retrieved by kNN-LM. We find two trends: (1) the presence of large overlapping n-grams between the datastore and evaluation set plays an important factor in strong performance, even when the datastore is derived from the training data; and (2) the kNN-LM is most beneficial when retrieved items have high semantic similarity with the query. Based on our analysis, we define a new formulation of the kNN-LM that uses retrieval quality to assign the interpolation coefficient. We empirically measure the effectiveness of our approach on two English language modeling datasets, Wikitext-103 and PG-19. Our re-formulation of the kNN-LM is beneficial in both cases, and leads to nearly 4% improvement in perplexity on the Wikitext-103 test set.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [4.174302577972412, 7.114230632781982], "id": 1889, "title": "StuBot: Learning by Teaching a Conversational Agent Through Machine Reading Comprehension", "authors": "Nayoung Jin and Hana Lee", "abstract": "This paper proposes StuBot, a text-based conversational agent that provides adaptive feedback for learning by teaching. StuBot first asks the users to teach the learning content by summarizing and explaining it in their own words. After the users inputted the explanation text for teaching, StuBot uses a machine reading comprehension (MRC) engine to provide adaptive feedback with further questions about the insufficient parts of the explanation text. We conducted a within-subject study to evaluate the effectiveness of adaptive feedback by StuBot. Both the quantitative and qualitative results showed that learning by teaching with adaptive feedback can improve learning performance, immersion, and overall experience.", "track": "NLP Applications", "label": 0}, {"loc": [8.0657958984375, 5.7072038650512695], "id": 1893, "title": "Improved Universal Sentence Embeddings with Prompt-based Contrastive Learning and Energy-based Learning", "authors": "Yuxin Jiang, Linhan Zhang and Wei Wang", "abstract": "Contrastive learning has been demonstrated to be effective in enhancing pre-trained language models (PLMs) to derive superior universal sentence embeddings. However, existing contrastive methods still have two limitations. Firstly, previous works may acquire poor performance under domain shift settings, thus hindering the application of sentence representations in practice. We attribute this low performance to the over-parameterization of PLMs with millions of parameters. To alleviate it, we propose PromCSE (Prompt-based Contrastive Learning for Sentence Embeddings), which only trains small-scale Soft Prompt (i.e., a set of trainable vectors) while keeping PLMs fixed. Secondly, the commonly used NT-Xent loss function of contrastive learning does not fully exploit hard negatives in supervised learning settings. To this end, we propose to integrate an Energy-based Hinge loss to enhance the pairwise discriminative power, inspired by the connection between the NT-Xent loss and the Energy-based Learning paradigm. Empirical results on seven standard semantic textual similarity (STS) tasks and a domain-shifted STS task both show the effectiveness of our method compared with the current state-of-the-art sentence embedding models.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [5.753551483154297, 11.742570877075195], "id": 1907, "title": "RaP: Redundancy-aware Video-language Pre-training for Text-Video Retrieval", "authors": "Xing Wu, Chaochen Gao, Zijia Lin, Zhongyuan Wang, Jizhong Han and Songlin Hu", "abstract": "Video language pre-training methods have mainly adopted sparse sampling techniques to alleviate the temporal redundancy of videos. Though effective, sparse sampling still suffers inter-modal redundancy: visual redundancy and textual redundancy. Compared with highly generalized text, sparsely sampled frames usually contain text-independent portions, called visual redundancy. Sparse sampling is also likely to miss important frames corresponding to some text portions, resulting in textual redundancy. Inter-modal redundancy leads to a mismatch of video and text information, hindering the model from better learning the shared semantics across modalities. To alleviate it, we propose Redundancy-aware Video-language Pre-training. We design a redundancy measurement of video patches and text tokens by calculating the cross-modal minimum dis-similarity. Then, we penalize the high-redundant video patches and text tokens through a proposed redundancy-aware contrastive learning. We evaluate our method on four benchmark datasets, MSRVTT, MSVD, DiDeMo, and LSMDC, achieving a significant improvement over the previous state-of-the-art results.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [10.157038688659668, 7.585496425628662], "id": 1910, "title": "FCGCL: Fine- and Coarse-Granularity Contrastive Learning for Speech Translation", "authors": "Hao Zhang, Nianwen Si, Yaqi Chen, Zhen Li, Tong Niu, Xukui Yang and Dan Qu", "abstract": "It is notoriously difficult to implement end-to-end speech translation (E2E-ST) model because of the task complexity and data scarcity. Existing techniques often attempt to carry out implicit knowledge transfer from machine translation (MT) to ST model by imposing various constraints. However, in this transfer scenario, a significant problem is that the performance of the MT will drop significantly and the final transfer effect is also restricted. In this article, we recommend Fine and Coarse Granularity Contrastive Learning (FCGCL), which conduct explicit knowledge transfer from MT to ST model. Specially, we ensure through multi granularity contrastive learning that inputs with similar semantic between different modalities are encoded closely in the shared semantic space while inputs with different semantics are kept apart. Experiments on the MuST-C datasets on all 8 languages and further analysis show that our method can effectively improve the E2E-ST performance and achieves an average BLEU of 29.0.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [7.992631912231445, 5.678435802459717], "id": 1917, "title": "InfoCSE: Information-aggregated Contrastive Learning of Sentence Embeddings", "authors": "Xing Wu, Chaochen Gao, Zijia Lin, Jizhong Han, Zhongyuan Wang and Songlin Hu", "abstract": "Contrastive learning has been extensively studied in sentence embedding learning, which assumes that the embeddings of different views of the same sentence are closer. The constraint brought by this assumption is weak, and a good sentence representation should also be able to reconstruct the original sentence fragments. Therefore, this paper proposes an information-aggregated contrastive learning framework for learning unsupervised sentence embeddings, termed InfoCSE.InfoCSE forces the representation of [CLS] positions to aggregate denser sentence information by introducing an additional Masked language model task and a well-designed network. We evaluate the proposed InfoCSE on several benchmark datasets w.r.t the semantic text similarity (STS) task. Experimental results show that InfoCSE outperforms SimCSE by an average Spearman correlation of 2.60% on BERT-base, and 1.77% on BERT-large, achieving state-of-the-art results among unsupervised sentence representation learning methods.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [6.453577041625977, 1.925446629524231], "id": 1929, "title": "Benchmarking Language Models for Code Syntax Understanding", "authors": "Da Shen, Xinyun Chen, Chenguang Wang, Koushik Sen and Dawn Song", "abstract": "Pre-trained language models have demonstrated impressive performance in both natural language processing and program understanding, which represent the input as a token sequence without explicitly modeling its structure. Some prior works show that pre-trained language models can capture the syntactic rules of natural languages without finetuning on syntax understanding tasks. However, there is limited understanding of how well pre-trained models understand the code structure so far. In this work, we perform the first thorough benchmarking of the state-of-the-art pre-trained models for identifying the syntactic structures of programs. Specifically, we introduce CodeSyntax, a large-scale dataset of programs annotated with the syntactic relationships in their corresponding abstract syntax trees. Our key observation is that pre-training on massive code data does not result in decent code syntax understanding. In fact, these pre-trained programming language models fail to match the performance of naive baselines based on positional offsets and keywords. \nWe also present a natural language benchmark to highlight the differences between natural languages and programming languages in terms of understanding corresponding syntactic structures. Our findings point out key limitations of existing pre-training methods and suggest the importance of modeling syntactic structures for the programming language.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [3.8505043983459473, 7.242842674255371], "id": 1931, "title": "Learning When and What to Quote: A Quotation Recommender System with Mutual Promotion of Recommendation and Generation", "authors": "Lingzhi Wang, Xingshan Zeng and Kam-Fai Wong", "abstract": "This work extends the current quotation recommendation task to a more realistic quotation recommender system that learns to predict when to quote and what to quote jointly. The system consists of three modules (tasks), a prediction module to predict whether to quote given conversation contexts, a recommendation module to recommend suitable quotations and a generation module generating quotations or sentences in ordinary language to continue the conversation. We benchmark several competitive models for the two newly introduced tasks (i.e., when-to-quote and what-to-continue). For quotation recommendation, compared with previous work that is either generation-based or ranking-based recommendation, we propose a novel framework with mutual promotion of generation module and ranking-based recommendation module. Experiments show that our framework achieves significantly better performance than baselines on two datasets. Further experiments and analyses validate the effectiveness of the proposed mechanisms and get a better understanding of the quotation recommendation task.", "track": "NLP Applications", "label": 0}, {"loc": [5.124294281005859, 12.127364158630371], "id": 1933, "title": "Think Beyond Words: Exploring Context-Relevant Visual Commonsense for Diverse Dialogue Generation", "authors": "Yiting Liu, Liang Li, Beichen Zhang and Qingming Huang", "abstract": "Commonsense knowledge has been widely considered for building intelligent open-domain dialogue agents, aiming to generate meaningful and diverse responses. Previous works in this field usually lack the ability to effectively obtain and utilize auxiliary commonsense from the external visual world. In this paper, we argue that exploiting logical information in images related to context can be effective to enrich and steer the generation process. In view of this, we propose VICTOR, a context-relevant VIsual Commonsense enhanced dialogue generaTOR for generating coherent and informative responses. To obtain the associated visual commonsense, we devise a novel approach that expands topic words on the knowledge graph and maps them into daily scenarios. During the generation, the model adopts multimodal fusion mechanism to integrate visual and textual information, and adaptively combine their decoding distributions for better response generation. The experimental results on two public datasets show that our proposed method outperforms the latest competitive methods in terms of coherence and diversity.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [6.447525978088379, 5.617816925048828], "id": 1938, "title": "Gender Bias in Meta-Embeddings", "authors": "Masahiro Kaneko, Danushka Bollegala and Naoaki Okazaki", "abstract": "Different methods have been proposed to develop meta-embeddings from a given set of source embeddings. \nHowever, the source embeddings can contain unfair gender-related biases, and how these influence the meta-embeddings has not been studied yet.\nWe study the gender bias in meta-embeddings created under three different settings:\n(1) meta-embedding multiple sources without performing any debiasing (Multi-Source No-Debiasing),\n(2) meta-embedding multiple sources debiased by a single method (Multi-Source Single-Debiasing), and\n(3) meta-embedding a single source debiased by different methods (Single-Source Multi-Debiasing).\nOur experimental results show that meta-embedding amplifies the gender biases compared to input source embeddings.\nWe find that debiasing not only the sources but also their meta-embedding is needed to mitigate those biases.\nMoreover, we propose a novel debiasing method based on meta-embedding learning where we use multiple debiasing methods on a single source embedding and then create a single unbiased meta-embedding.", "track": "Ethics", "label": 21}, {"loc": [9.622785568237305, 6.466322898864746], "id": 1941, "title": "Third-Party Aligner for Neural Word Alignments", "authors": "Jinpeng Zhang, Chuanqi Dong, Xiangyu Duan, Yuqi Zhang and Min Zhang", "abstract": "Word alignment is to find translationally equivalent words between source and target sentences. Previous work has demonstrated that self-training can achieve competitive word alignment results. In this paper, we propose to use word alignments generated by a third-party word aligner to supervise the neural word alignment training. Specifically, source word and target word of each word pair aligned by the third-party aligner are trained to be close neighbors to each other in the contextualized embedding space when fine-tuning a pre-trained cross-lingual language model. Experiments on the benchmarks of various language pairs show that our approach can surprisingly do self-correction over the third-party supervision by finding more accurate word alignments and deleting wrong word alignments, leading to better performance than various third-party word aligners, including the currently best one. When we integrate all supervisions from various third-party aligners, we achieve state-of-the-art word alignment performances, with averagely more than two points lower alignment error rates than the best third-party aligner.We released our code at https://github.com/sdongchuanqi/Third-Party-Supervised-Aligner.", "track": "Multilinguality", "label": 13}, {"loc": [2.778637647628784, 4.7255377769470215], "id": 1966, "title": "QaDialMoE: Question-answering Dialogue based Fact Verification with Mixture of Experts", "authors": "Longzheng Wang, Peng Zhang, Xiaoyu Sean Lu, Lei Zhang, Chaoyang Yan and Chuang Zhang", "abstract": "Fact verification is an essential tool to mitigate the spread of false information online, which has gained a widespread attention recently. However, a fact verification in the question-answering dialogue is still underexplored. In this paper, we propose a neural network based approach called question-answering dialogue based fact verification with mixture of experts (QaDialMoE). It exploits questions and evidence effectively in the verification process and can significantly improve the performance of fact verification. Specifically, we exploit the mixture of experts to focus on various interactions among responses, questions and evidence. A manager with an attention guidance module is implemented to guide the training of experts and assign a reasonable attention score to each expert. A prompt module is developed to generate synthetic questions that make our approach more generalizable. Finally, we evaluate the QaDialMoE and conduct a comparative study on three benchmark datasets. The experimental results demonstrate that our QaDialMoE outperforms previous approaches by a large margin and achieves new state-of-the-art results on all benchmarks. This includes the accuracy improvements on the HEALTHVER as 84.26%, the FAVIQ A dev set as 78.7%, the FAVIQ R dev set as 86.1%, test set as 86.0%, and the COLLOQUIAL as 89.5%. To our best knowledge, this is the first work to investigate a question-answering dialogue based fact verification, and achieves new state-of-the-art results on various benchmark datasets.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [5.5833516120910645, 12.74416446685791], "id": 1970, "title": "Multimodal Knowledge Learning for Named Entity Disambiguation", "authors": "Zhang Dongjie and Longtao Huang", "abstract": "With the popularity of online social media, massive-scale multimodal information has brought new challenges to traditional Named Entity Disambiguation (NED) tasks. Recently, Multimodal Named Entity Disambiguation (MNED) has been proposed to link ambiguous mentions with the textual and visual contexts to a predefined knowledge graph. Existing attempts usually perform MNED by annotating multimodal mentions and adding multimodal features to traditional NED models. However, these studies may suffer from 1) failing to model multimodal information at the knowledge level, and 2) lacking multimodal annotation data against the large-scale unlabeled corpus. In this paper, we explore a pioneer study on leveraging multimodal knowledge learning to address the MNED task. Specifically, we first harvest multimodal knowledge in the Meta-Learning way, which is much easier than collecting ambiguous mention corpus. Then we design a knowledge-guided transfer learning strategy to extract unified representation from different modalities. Finally, we propose an Interactive Multimodal Learning Network (IMN) to fully utilize the multimodal information on both the mention and knowledge sides. Extensive experiments conducted on two public MNED datasets demonstrate that the proposed method achieves improvements over the state-of-the-art multimodal methods.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [0.8930509090423584, 8.156498908996582], "id": 1977, "title": "Generative Prompt Tuning for Relation Classification", "authors": "Jiale Han, Shuai Zhao, Bo Cheng, shengkun ma and Wei Lu", "abstract": "Using prompts to explore the knowledge contained within pre-trained language models for downstream tasks has now become an active topic. Current prompt tuning methods mostly convert the downstream tasks to masked language modeling problems by adding cloze-style phrases and mapping all labels to verbalizations with fixed length, which has proven effective for tasks with simple label spaces. However, when applied to relation classification exhibiting complex label spaces, vanilla prompt tuning methods may struggle with label verbalizations with arbitrary lengths due to rigid prompt restrictions. Inspired by the text infilling task for pre-training generative models that can flexibly predict missing spans, we propose a novel generative prompt tuning method to reformulate relation classification as an infilling problem, which frees our approach from limitations of current prompt based approaches and thus fully exploits rich semantics of entity and relation types. In addition, we design entity-guided decoding and discriminative relation scoring to generate and align relations effectively and efficiently during inference. Extensive experiments under fully supervised settings and low-resource settings demonstrate the effectiveness of our approach.", "track": "Information Extraction", "label": 5}, {"loc": [1.7668083906173706, 8.903051376342773], "id": 1993, "title": "Formulating Few-shot Fine-tuning Towards Language Model Pre-training: A Pilot Study on Named Entity Recognition", "authors": "Zihan Wang, Kewen Zhao, Zilong Wang and Jingbo Shang", "abstract": "Fine-tuning pre-trained language models is a common practice in building NLP models for various tasks, including the case with less supervision. We argue that under the few-shot setting, formulating fine-tuning closer to the pre-training objective shall be able to unleash more benefits from the pre-trained language models. In this work, we take few-shot named entity recognition (NER) for a pilot study, where existing fine-tuning strategies are much different from pre-training. We propose a novel few-shot fine-tuning framework for NER, FFF-NER. Specifically, we introduce three new types of tokens, \"is-entity\", \"which-type\" and \"bracket\", so we can formulate the NER fine-tuning as (masked) token prediction or generation, depending on the choice of the pre-training objective. In our experiments, we apply \\our to fine-tune both BERT and BART for few-shot NER on several benchmark datasets and observe significant improvements over existing fine-tuning strategies, including sequence labeling, prototype meta-learning, and prompt-based approaches. We further perform a series of ablation studies, showing few-shot NER performance is strongly correlated with the similarity between fine-tuning and pre-training.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [4.570583343505859, 3.2198073863983154], "id": 1996, "title": "Masked Language Models Know Which are Popular: A Simple Ranking Strategy for Commonsense Question Answering", "authors": "Xuan Luo, Chuang Fan, Yice Zhang, Wanguo Jiang, Bing Qin and Ruifeng Xu", "abstract": "We propose a simple ranking strategy to solve a generative commonsense question answering (QA) problem. Compared with multiple-choice QA, it is challenging because the answers to a question are not unique and they are supposed to be popular and diverse. Our strategy exploits the dataset itself and negative samples that we collect from WordNet to train a ranker that picks out the most popular answers for commonsense questions. The effectiveness of our strategy is verified on different pre-trained masked language models (MLMs) in a pipeline framework, where an MLM reranks the generated answers. Further, we explore an end-to-end framework where MLMs are utilized to guide the generation of generative language models (GLMs). Taking advantage of reinforcement learning, we apply policy gradient to train a GLM with the rewards fed back by an MLM. Empirical results on ProtoQA dataset demonstrate that MLMs can acquire the ability to distinguish the popular answers and improve the typical answer generation of GLMs as well.", "track": "Question Answering", "label": 11}, {"loc": [4.109504222869873, 7.391895771026611], "id": 1998, "title": "DialogUSR: Complex Dialogue Utterance Splitting and Reformulation for Multiple Intent Detection", "authors": "Haoran Meng, Zheng Xin, Tianyu Liu, Zizhen Wang, he feng, Binghuai Lin, Xuemin Zhao, Yunbo Cao and Zhifang Sui", "abstract": "While interacting with chatbots, users may elicit multiple intents in a single dialogue utterance. Instead of training a dedicated multi-intent detection model, we propose DialogUSR, a dialogue utterance splitting and reformulation task that first splits multi-intent user query into several single-intent sub-queries and then recovers all the coreferred and omitted information in the sub-queries. DialogUSR can serve as a plug-in and domain-agnostic module that empowers the multi-intent detection for the deployed chatbots with minimal efforts. We collect a high-quality naturally occurring dataset that covers 23 domains with a multi-step crowd-souring procedure. To benchmark the proposed dataset, we propose multiple action-based generative models that involve end-to-end and two-stage training, and conduct in-depth analyses on the pros and cons of the proposed baselines.", "track": "Resources and Evaluation", "label": 1}, {"loc": [7.978288173675537, 9.156038284301758], "id": 2000, "title": "Low-resource Interactive Active Labeling for Fine-tuning Language Models", "authors": "Seiji Maekawa, Dan Zhang, Hannah Kim, Sajjadur Rahman and Estevam Hruschka", "abstract": "Recently, active learning (AL) methods have been used to effectively fine-tune pre-trained language models for various NLP tasks such as sentiment analysis and document classification. However, given the task of fine-tuning language models, understanding the impact of different aspects on AL methods such as labeling cost, sample acquisition latency, and the diversity of the datasets necessitates a deeper investigation. This paper examines the performance of existing AL methods within a low-resource, interactive labeling setting. We observe that existing methods often underperform in such a setting while exhibiting higher latency and a lack of generalizability. To overcome these challenges, we propose a novel active learning method TYROUGE that employs a hybrid sampling strategy to minimize labeling cost and acquisition latency while providing a framework for adapting to dataset diversity via user guidance. Through our experiments, we observe that compared to SOTA methods, TYROUGE reduces the labeling cost by up to 43% and the acquisition latency by as much as 11X, while achieving comparable accuracy. Finally, we discuss the strengths and weaknesses of TYROUGE by exploring the impact of dataset characteristics.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [8.165827751159668, 5.287082672119141], "id": 2005, "title": "Getting the Most out of Simile Recognition", "authors": "Xiaoyue Wang, Linfeng Song, Xin Liu, Chulun Zhou, Hualin Zeng and Jinsong Su", "abstract": "Simile recognition involves two subtasks: simile sentence classification that discriminates whether a sentence contains simile, and simile component extraction that locates the corresponding objects (i.e., tenors and vehicles).\nRecent work ignores features other than surface strings and suffers from the data hunger issue.\nWe explore expressive features for this task to help achieve more effective data utilization.\nIn particular, we study two types of features: 1) input-side features that include POS tags, dependency trees and word definitions, and 2) decoding features that capture the interdependence among various decoding decisions.\nWe further construct a model named HGSR, which merges the input-side features as a heterogeneous graph and leverages decoding features via distillation.\nExperiments show that HGSR significantly outperforms the current state-of-the-art systems and carefully designed baselines, verifying the effectiveness of introduced features. We will release our code upon paper acceptance.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [5.148991584777832, 8.765201568603516], "id": 2006, "title": "A Unified Framework for Pun Generation with Humor Principles", "authors": "Yufei Tian, Divyanshu Arun Sheth and Nanyun Peng", "abstract": "We propose a unified framework to generate both homophonic and homographic puns to resolve the split-up in existing works. Specifically, we incorporate three linguistic attributes of puns to the language models: ambiguity, distinctiveness, and surprise. Our framework consists of three parts: 1) a context words/phrases selector to promote the aforementioned attributes, 2) a generation model trained on non-pun sentences to incorporate the context words/phrases into the generation output, and 3) a label predictor that learns the structure of puns which is used to steer the generation model at inference time. \nEvaluation results on both pun types demonstrate the efficacy of our model over strong baselines.", "track": "Natural Language Generation", "label": 6}, {"loc": [9.56695556640625, 6.545870780944824], "id": 2007, "title": "Improving English-Arabic Transliteration with Phonemic Memories", "authors": "Yuanhe Tian, Renze Lou, Xiangyu Pang, Lianxi Wang, Shengyi JIANG and Yan Song", "abstract": "Transliteration is an important task in natural language processing (NLP) which aims to convert a name in the source language to the target language without changing its pronunciation. Particularly, transliteration from English to Arabic is highly needed in many applications, especially in countries (e.g., United Arab Emirates (UAE)) whose most citizens are foreigners but the official language is Arabic. In such a task-oriented scenario, namely transliterating the English names to the corresponding Arabic ones, the performance of the transliteration model is highly important. However, most existing neural approaches mainly apply a universal transliteration model with advanced encoders and decoders to the task, where limited attention is paid to leveraging the phonemic association between English and Arabic to further improve model performance. In this paper, we focus on transliteration of people's names from English to Arabic for the general public. In doing so, we collect a corpus named EANames by extracting high quality name pairs from online resources which better represent the names in the general public than linked Wikipedia entries that are always names of famous people). We propose a model for English-Arabic transliteration, where a memory module modeling the phonemic association between English and Arabic is used to guide the transliteration process. We run experiments on the collected data and the results demonstrate the effectiveness of our approach for English-Arabic transliteration.", "track": "Machine Translation", "label": 10}, {"loc": [4.154089450836182, 7.382443904876709], "id": 2054, "title": "Mix-and-Match: Scalable Dialog Response Retrieval using Gaussian Mixture Embeddings", "authors": "Gaurav Pandey, Danish Contractor and Sachindra Joshi", "abstract": "Embedding-based approaches for dialog response retrieval embed the context-response pairs as points in the embedding space. These approaches are scalable, but fail to account for the complex, many-to-many relationships that exist between context-response pairs. \nOn the other end of the spectrum, there are approaches that feed the context-response pairs jointly through multiple layers of neural networks. These approaches can model the complex relationships between context-response pairs, but fail to scale when the set of responses is moderately large (>1000). \nIn this paper, we propose a scalable model that can learn complex relationships between context-response pairs. Specifically, the model maps the contexts as well as responses to probability distributions over the embedding space. We train the models by optimizing the Kullback-Leibler divergence between the distributions induced by context-response pairs in the training data. We show that the resultant model achieves better performance as compared to other embedding-based approaches on publicly available conversation data.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [8.802298545837402, 8.294027328491211], "id": 2074, "title": "AlphaTuning: Quantization-Aware Parameter-Efficient Adaptation of Large-Scale Pre-Trained Language Models", "authors": "Se Jung Kwon, Jeonghoon Kim, Jeongin Bae, Kang Min Yoo, Jin-Hwa Kim, Baeseong Park, Byeongwook Kim, Jung-Woo Ha, Nako Sung and Dongsoo Lee", "abstract": "There are growing interests in adapting large-scale language models using parameter-efficient fine-tuning methods. However, accelerating the model itself and achieving better inference efficiency through model compression has not been thoroughly explored yet.\nModel compression could provide the benefits of reducing memory footprints, enabling low-precision computations, and ultimately achieving cost-effective inference.\nTo combine parameter-efficient adaptation and model compression, we propose AlphaTuning consisting of post-training quantization of the pre-trained language model and fine-tuning only some parts of quantized parameters for a target task.\nSpecifically, AlphaTuning works by employing binary-coding quantization, which factorizes the full-precision parameters into binary parameters and a separate set of scaling factors.\nDuring the adaptation phase, the binary values are frozen for all tasks, while the scaling factors are fine-tuned for the downstream task.\nWe demonstrate that AlphaTuning, when applied to GPT-2 and OPT, performs competitively with full fine-tuning on a variety of downstream tasks while achieving >10x compression ratio under 4-bit quantization and >1,000x reduction in the number of trainable parameters.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [3.3662941455841064, 4.050569534301758], "id": 2086, "title": "Learning Invariant Representation Improves Robustness for MRC Models", "authors": "Hai Yu, Liang Wen, Haoran Meng, Tianyu Liu and Houfeng Wang", "abstract": "The prosperity of Pretrained Language Models(PLM) has greatly promoted the development of Machine Reading Comprehension (MRC). However, these models are vulnerable and not robust to adversarial examples. In this paper, we propose Stable and Contrastive Question Answering (SCQA) to improve invariance of representation to alleviate these robustness issues. Specifically, we first construct positive example pairs which have same answer through data augmentation. Then SCQA learns enhanced representations with better alignment between positive pairs by introducing stability and contrastive loss. Experimental results show that our approach can boost the robustness of QA models cross different MRC tasks and attack sets significantly and consistently.", "track": "Question Answering", "label": 11}, {"loc": [5.430537700653076, 5.07767391204834], "id": 2097, "title": "ER-Test: Evaluating Explanation Regularization Methods for Language Models", "authors": "Brihi Joshi, Aaron Z. Chan, Ziyi Liu, Shaoliang Nie, Maziar Sanjabi, Hamed Firooz and Xiang Ren", "abstract": "By explaining how humans would solve a given task, human rationales can provide strong learning signal for neural language models (NLMs). Explanation regularization (ER) aims to improve NLM generalization by pushing the NLM's machine rationales (Which input tokens did the NLM focus on?) to align with human rationales (Which input tokens would humans focus on). Though prior works primarily study ER via in-distribution (ID) evaluation, out-of-distribution (OOD) generalization is often more critical in real-world scenarios, yet ER's effect on OOD generalization has been underexplored.\n\nIn this paper, we introduce ER-Test, a framework for evaluating ER models' OOD generalization along three dimensions: unseen datasets, contrast set tests, and functional tests. Using ER-Test, we comprehensively analyze how ER models' OOD generalization varies with the rationale alignment criterion (loss function), human rationale type (instance-level v/s task-level), number and choice of rationale-annotated instances, and time budget for rationale annotation. Across two tasks and six datasets, we show that ER has little impact on ID performance but yields large OOD performance gains, with the best ER criterion being task-dependent. Also, ER can improve OOD performance even with task-level or few human rationales. Finally, we find that rationale annotation is more time-efficient than label annotation for improving OOD performance. Our results with ER-Test help demonstrate ER's utility and establish best practices for using ER effectively.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [1.0935375690460205, 10.570944786071777], "id": 2108, "title": "Learning Cooperative Interactions for Multi-Overlap Aspect Sentiment Triplet Extraction", "authors": "Shiman Zhao, Wei Chen and Tengjiao Wang", "abstract": "Aspect sentiment triplet extraction (ASTE) is an essential task, which aims to extract triplets\n(aspect, opinion, sentiment). However, overlapped triplets, especially multi-overlap triplets,\nmake ASTE a challenge. Most existing methods suffer from multi-overlap triplets because\nthey focus on the single interactions between an aspect and an opinion. To solve the above\nissues, we propose a novel multi-overlap triplet extraction method, which decodes the complex\nrelations between multiple aspects and opinions by learning their cooperative interactions. Overall, the method is based on an encoder-decoder architecture. During decoding, we design a\njoint decoding mechanism, which employs a multi-channel strategy to generate aspects and\nopinions through the cooperative interactions between them jointly. Furthermore, we construct\na correlation-enhanced network to reinforce the interactions between related aspects\nand opinions for sentiment prediction. Besides, a relation-wise calibration scheme is adopted\nto further improve performance. Experiments show that our method outperforms baselines,\nespecially multi-overlap triplets.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [8.797540664672852, 8.396204948425293], "id": 2111, "title": "Different Tunes Played with Equal Skill: Exploring a Unified Optimization Subspace for Parameter-Efficient Tuning", "authors": "Jing Yi, Weize Chen, Yujia Qin, Yankai Lin, Ning Ding, Xu Han, Zhiyuan Liu, Maosong Sun and Jie Zhou", "abstract": "Delta tuning (DET, also known as parameter-efficient tuning) is deemed as the new paradigm for using pre-trained language models (PLMs). Up to now, various DETs with distinct design elements have been proposed, achieving performance on par with fine-tuning. However, the mechanisms behind the above success are still under-explored, especially the connections among various DETs. To fathom the mystery, we hypothesize that the adaptations of different DETs could all be reparameterized as low-dimensional optimizations in a unified optimization subspace, which could be found by jointly decomposing independent solutions of different DETs. Then we explore the connections among different DETs by conducting optimization within the subspace. In experiments, we find that, for a certain DET, conducting optimization simply in the subspace could achieve comparable performance to its original space, and the found solution in the subspace could be transferred to another DET and achieve non-trivial performance. We also visualize the performance landscape of the subspace, and find that, there exists a substantial region where different DETs all perform well. Finally, we extend our analysis and show the strong connections between fine-tuning and DETs. The codes are publicly available at https://github.com/thunlp/Unified-DeltaTuning.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [6.896711826324463, 9.583019256591797], "id": 2113, "title": "Explainable Slot Type Attentions to Improve Joint Intent Detection and Slot Filling", "authors": "Kalpa Gunaratna, Vijay Srinivasan, Akhila Yerukola and Hongxia Jin", "abstract": "Joint intent detection and slot filling is a key research topic in natural language understanding (NLU). Existing joint intent and slot filling systems analyze and compute features collectively for all slot types, and importantly, have no way to explain the slot filling model decisions. In this work, we propose a novel approach that: (i) learns to generate additional slot type specific features in order to improve accuracy and (ii) provides explanations for slot filling decisions for the first time in a joint NLU model. We perform an additional constrained supervision using a set of binary classifiers for the slot type specific feature learning, thus ensuring appropriate attention weights are learned in the process to explain slot filling decisions for utterances. Our model is inherently explainable and does not need any post-hoc processing. We evaluate our approach on two widely used datasets and show accuracy improvements. Moreover, a detailed analysis is also provided for the exclusive slot explainability.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.861375331878662, 3.2929584980010986], "id": 2138, "title": "PseudoReasoner: Leveraging Pseudo Labels for Commonsense Knowledge Base Population", "authors": "Tianqing Fang, Quyet V. Do, Hongming Zhang, Yangqiu Song, Ginny Y. Wong and Simon See", "abstract": "Commonsense Knowledge Base (CSKB) Population aims at reasoning over unseen entities and assertions on CSKBs, and is an important yet hard commonsense reasoning task. One challenge is that it requires out-of-domain generalization ability as the source CSKB for training is of a relatively smaller scale (1M) while the whole candidate space for population is way larger (200M). We propose PseudoReasoner, a semi-supervised learning framework for CSKB population that uses a teacher model pre-trained on CSKBs to provide pseudo labels on the unlabeled candidate dataset for a student model to learn from. \nThe teacher can be a generative model rather than restricted to discriminative models as previous works.\nIn addition, we design a new filtering procedure for pseudo labels based on influence function and the student model's prediction to further improve the performance. The framework can improve the backbone model KG-BERT (RoBERTa-large) by 3.3 points on the overall performance and especially, 5.3 points on the out-of-domain performance, and achieves the state-of-the-art. The codes will be made public on acceptance. Codes and data are available at https://github.com/HKUST-KnowComp/PseudoReasoner.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [4.071962356567383, 7.363285064697266], "id": 2150, "title": "History-Aware Hierarchical Transformer for Multi-session Open-domain Dialogue System", "authors": "Tong Zhang, Yong Liu, Boyang Li, Zhiwei Zeng, Pengwei Wang, Yuan You, Chunyan Miao and Lizhen Cui", "abstract": "With the evolution of pre-trained language models, current open-domain dialogue systems have achieved great progress in conducting one-session conversations. In contrast, Multi-Session Conversation (MSC), which consists of multiple sessions over a long term with the same user, is under-investigated. In this paper, we propose History-Aware Hierarchical Transformer (HAHT) for multi-session open-domain dialogue. HAHT maintains a long-term memory of history conversations and utilizes history information to understand current conversation context and generate well-informed and context-relevant responses. Specifically, HAHT first encodes history conversation sessions hierarchically into a history memory. Then, HAHT leverages historical information to facilitate the understanding of the current conversation context by encoding the history memory together with the current context with attention-based mechanisms. Finally, to explicitly utilize historical information, HAHT uses a history-aware response generator that switches between a generic vocabulary and a history-aware vocabulary. Experimental results on a large-scale MSC dataset suggest that the proposed HAHT model consistently outperforms baseline models. Human evaluation results support that HAHT generates more human-like, context-relevant, and history-relevant responses than baseline models.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [3.646893262863159, 8.082352638244629], "id": 2161, "title": "Guiding Abstractive Dialogue Summarization with Content Planning", "authors": "Ye Wang, Xiaojun Wan and Zhiping Cai", "abstract": "Abstractive dialogue summarization has recently been receiving more attention. We propose a coarse-to-fine model for generating abstractive dialogue summaries, and introduce a fact-aware reinforcement learning (RL) objective that improves the fact consistency between the dialogue and the generated summary. Initially, the model generates the predicate-argument spans of the dialogue, and then generates the final summary through a fact-aware RL objective. Extensive experiments and analysis on two benchmark datasets demonstrate that our proposed method effectively improves the quality of the generated summary, especially in coherence and consistency.", "track": "Summarization", "label": 14}, {"loc": [6.202732563018799, 8.767577171325684], "id": 2181, "title": "Truncation Sampling as Language Model Desmoothing", "authors": "John Hewitt, Christopher Manning and Percy Liang", "abstract": "Long samples of text from neural language models can be of poor quality. Truncation sampling algorithms--like top-p or top-k---address this by setting some words' probabilities to zero at each step. This work investigates why these methods are important, and how to improve them. We propose thinking of a neural language model as a mixture of a true distribution and a smoothing distribution that avoids infinite perplexity. In this light, truncation algorithms aim to perform desmoothing, estimating a subset of the support of the true distribution. Finding a good subset is crucial: we show that top-p unnecessarily truncates high-probability words, for example causing it to truncate all words but Trump for a document that starts with Donald. We introduce eta-sampling, which truncates words below an entropy-dependent probability threshold. Compared to previous algorithms, our eta-sampling generates more plausible long documents according to humans, is better at breaking out of repetition, and behaves more reasonably on a battery of test distributions.", "track": "Natural Language Generation", "label": 6}, {"loc": [4.565167427062988, 7.698232173919678], "id": 2193, "title": "Knowledge-grounded Dialog State Tracking", "authors": "Dian Yu, Mingqiu Wang, Yuan Cao, Laurent El Shafey, Izhak Shafran and Hagen Soltau", "abstract": "Knowledge (including structured knowledge such as schema and ontology and unstructured knowledge such as web corpus) is a critical part of dialog understanding, especially for unseen tasks and domains. Traditionally, such domain-specific knowledge is encoded implicitly into model parameters for the execution of downstream tasks, which makes training inefficient. In addition , such models are not easily transferable to new tasks with different schemas. In this work, we propose to perform dialog state tracking grounded on knowledge encoded externally. We query relevant knowledge of various forms based on the dialog context where such information can grounds the prediction of dialog states. We demonstrate superior performance of our proposed method over strong baselines, especially in the few-shot learning setting.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [5.959856033325195, 5.61455774307251], "id": 2200, "title": "Context-aware Information-theoretic Causal De-biasing for Interactive Sequence Labeling", "authors": "Junda Wu, Rui Wang, Tong Yu, Ruiyi Zhang, Handong Zhao, Shuai Li, Ricardo Henao and Ani Nenkova", "abstract": "Supervised training of existing deep learning models for sequence labeling relies on large scale labeled datasets. Such datasets are generally created with crowd-source labeling. However, crowd-source labeling for tasks of sequence labeling can be expensive and time-consuming. Further, crowd-source labeling by external annotators may not be appropriate for data that contains user private information. Considering the above limitations of crowd-source labeling, we study interactive sequence labeling that allows training directly with the user feedback, which alleviates the annotation cost and maintains the user privacy. We identify two bias, namely, context bias and feedback bias, by formulating interactive sequence labeling via a Structural Causal Model (SCM). To alleviate the context and feedback bias based on the SCM, we identify the frequent context tokens as confounders in the backdoor adjustment and further propose an entropy-based modulation that is inspired by information theory. entities more sample-efficiently. With extensive experiments, we validate that our approach can effectively alleviate the biases and our models can be efficiently learnt with the user feedback.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.773019790649414, 4.5846757888793945], "id": 2201, "title": "Simple but Challenging: Natural Language Inference Models Fail on Simple Sentences", "authors": "Cheng Luo, Wei Liu, Jieyu Lin, Jiajie Zou, Ming Xiang and Nai Ding", "abstract": "Natural language inference (NLI) is a task to infer the relationship between a premise and a hypothesis (e.g., entailment, neutral, or contradiction), and transformer-based models perform well on current NLI datasets such as MNLI and SNLI. Nevertheless, given the linguistic complexity of the large-scale datasets, it remains controversial whether these models can truly infer the relationship between sentences or they simply guess the answer via shallow heuristics. Here, we introduce a controlled evaluation set called Simple Pair to test the basic sentence inference ability of NLI models using sentences with syntactically simple structures. Three popular transformer-based models, i.e., BERT, RoBERTa, and DeBERTa, are employed. We find that these models fine-tuned on MNLI or SNLI perform very poorly on Simple Pair (< 35.4% accuracy). Further analyses reveal event coreference and compositional binding problems in these models. To improve the model performance, we augment the training set, i.e., MNLI or SNLI, with a few examples constructed based on Simple Pair (~ 1% of the size of the original SNLI/MNLI training sets). Models fine-tuned on the augmented training set maintain high performance on MNLI/SNLI and perform very well on Simple Pair (~100% accuracy). Furthermore, the positive performance of the augmented training models can transfer to more complex examples constructed based on sentences from MNLI and SNLI. Taken together, the current work shows that (1) models achieving high accuracy on mainstream large-scale datasets still lack the capacity to draw accurate inferences on simple sentences, and (2) augmenting mainstream datasets with a small number of target simple sentences can effectively improve model performance.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [0.7555455565452576, 8.09090805053711], "id": 2210, "title": "DORE: Document Ordered Relation Extraction based on Generative Framework", "authors": "Qipeng Guo, Yuqing Yang, Hang Yan, Xipeng Qiu and Zheng Zhang", "abstract": "In recent years, there is a surge of generation-based information extraction work, which allows a more direct use of pre-trained language models and efficiently captures output dependencies. However, previous generative methods using lexical representation do not naturally fit document-level relation extraction (DocRE) where there are multiple entities and relational facts. In this paper, we investigate the root cause of the underwhelming performance of the existing generative DocRE models and discover that the culprit is the inadequacy of the training paradigm, instead of the capacities of the models. We propose to generate a symbolic and ordered sequence from the relation matrix which is deterministic and easier for model to learn. Moreover, we design a parallel row generation method to process overlong target sequences. Besides, we introduce several negative sampling strategies to improve the performance with balanced signals. Experimental results on four datasets show that our proposed method can improve the performance of the generative DocRE models.", "track": "Information Extraction", "label": 5}, {"loc": [2.412846565246582, 7.310841083526611], "id": 2214, "title": "Explicit Role Interaction Network for Event Argument Extraction", "authors": "Nan Ding, Chunming Hu, Kai Sun, Samuel Mensah and Richong Zhang", "abstract": "Event argument extraction is a challenging subtask of event extraction, aiming to identify and assign roles to arguments under a certain event. Existing methods extract arguments of each role independently, ignoring the relationship between different roles. Such an approach hinders the model from learning explicit interactions between different roles to improve the performance of individual argument extraction. As a solution, we design a neural model that we refer to as the Explicit Role Interaction Network (ERIN) which allows for dynamically capturing the correlations between different argument roles within an event. Extensive experiments on the benchmark dataset ACE2005 demonstrate the superiority of our proposed model to existing approaches.", "track": "Information Extraction", "label": 5}, {"loc": [7.91349458694458, 9.392195701599121], "id": 2218, "title": "Few-Shot Out-of-Domain Transfer Learning of Natural Language Explanations in a Label-Abundant Setup", "authors": "Yordan Yordanov, Vid Kocijan, Thomas Lukasiewicz and Oana-Maria Camburu", "abstract": "Training a model to provide natural language explanations (NLEs) for its predictions usually requires the acquisition of task-specific NLEs, which is time- and resource-consuming. A potential solution is the few-shot out-of-domain transfer of NLEs from a parent task with many NLEs to a child task.\nIn this work, we examine the setup in which the child task has few NLEs but abundant labels. We establish four few-shot transfer learning methods that cover the possible fine-tuning combinations of the labels and NLEs for the parent and child tasks. We transfer explainability from a large natural language inference dataset (e-SNLI) separately to two child tasks: (1) hard cases of pronoun resolution, where we introduce the small-e-WinoGrande dataset of NLEs on top of the WinoGrande dataset, and (2)~commonsense validation (ComVE). Our results demonstrate that the parent task helps with NLE generation and we establish the best methods for this setup.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [7.707474231719971, 3.559474229812622], "id": 2238, "title": "RoChBert: Towards Robust BERT Fine-tuning for Chinese", "authors": "Zihan Zhang, Jinfeng Li, Ning Shi, Bo Yuan, Xiangyu Liu, Rong Zhang, hui xue, DONGHONG SUN and Chao Zhang", "abstract": "Despite of the superb performance on a wide range of tasks, pre-trained language models (e.g., BERT) have been proved vulnerable to adversarial texts. In this paper, we present RoChBERT, a framework to build more Robust BERT-based models by utilizing a more comprehensive adversarial graph to fuse Chinese phonetic and glyph features into pre-trained representations during fine-tuning. Inspired by curriculum learning, we further propose to augment the training dataset with adversarial texts in combination with intermediate samples. Extensive experiments demonstrate that RoChBERT outperforms previous methods in significant ways: (i) robust -- RoChBERT greatly improves the model robustness without sacrificing accuracy on benign texts. Specifically, the defense lowers the success rates of unlimited and limited attacks by 59.43% and 39.33% respectively, while remaining accuracy of 93.30%; (ii) flexible -- RoChBERT can easily extend to various language models to solve different downstream tasks with excellent performance; and (iii) efficient -- RoChBERT can be directly applied to the fine-tuning stage without pre-training language model from scratch, and the proposed data augmentation method is also low-cost.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [7.198788642883301, 5.789646625518799], "id": 2246, "title": "Lexical Entailment with Hierarchy Representations by Deep Metric Learning", "authors": "Naomi Sato, Masaru Isonuma, Kimitaka Asatani, Shoya Ishizuka, Aori Shimizu and Ichiro Sakata", "abstract": "In this paper, we introduce a novel method for lexical entailment tasks, which detects a hyponym-hypernym relation among words. Existing lexical entailment studies are lacking in generalization performance, as they cannot be applied to words that are not included in the training dataset. Moreover, existing work evaluates the performance by using the dataset that contains words used for training. This study proposes a method that learns a mapping from word embeddings to the hierarchical embeddings in order to predict the hypernymy relations of any input words. To validate the generalization performance, we conduct experiments using a train dataset that does not overlap with the evaluation dataset. As a result, our method achieved state-of-the-art performance and showed robustness for unknown words.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [8.045004844665527, 8.828048706054688], "id": 2248, "title": "Improving the Sample Efficiency of Prompt Tuning with Domain Adaptation", "authors": "Xu Guo, Boyang Li and Han Yu", "abstract": "Prompt tuning, or the conditioning of a frozen pretrained language model (PLM) with soft prompts learned from data, has demonstrated impressive performance on a wide range of NLP tasks. However, prompt tuning requires a large training dataset to be effective and is outperformed by finetuning the entire PLM in data-scarce regimes. Previous work (Gu et al., 2022, Vu et al., 2022) proposed to transfer soft prompts pretrained on the source domain to the target domain. In this paper, we explore domain adaptation for prompt tuning, a problem setting where unlabeled data from the target domain are available during pretraining. We propose bOosting Prompt TunIng with doMain Adaptation (OPTIMA), which regularizes the decision boundary to be smooth around regions where source and target data distributions are similar. Extensive experiments demonstrate that OPTIMA significantly enhances the transferability and sample-efficiency of prompt tuning compared to strong baselines. Moreover, in few-shot settings, OPTIMA exceeds full-model tuning by a large margin.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [7.98429536819458, 5.755064010620117], "id": 2261, "title": "McPhraSy: Multi-Context Phrase Similarity and Clustering", "authors": "Amir Cohen, Hila Gonen, Ori Shapira, Ran Levy and Yoav Goldberg", "abstract": "Phrase similarity is a key component of many NLP applications. Current phrase similarity methods focus on embedding the phrase itself and use the phrase context only during training of the pretrained model. To better leverage the information in the context, we propose McPhraSy (Multi-context Phrase Similarity), a novel algorithm for estimating the similarity of phrases based on multiple contexts. At inference time, McPhraSy represents each phrase by considering multiple contexts in which it appears and computes the similarity of two phrases by aggregating the pairwise similarities between the contexts of the phrases. Incorporating context during inference enables McPhraSy to outperform current state-of-the-art models on two phrase similarity datasets by up to 13.3%. Finally, we also present a new downstream task that relies on phrase similarity \u2013 keyphrase clustering \u2013 and create a new benchmark for it in the product reviews domain. We show that McPhraSy surpasses all other baselines for this task.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [4.524389266967773, 9.149909019470215], "id": 2263, "title": "CANarEx: Contextually Aware Narrative Extraction for Semantically Rich Text-as-data Applications", "authors": "Nandini Anantharama, Simon D. Angus and Lachlan O'Neill", "abstract": "Narrative modelling is an area of active research, motivated by the acknowledgement of narratives as drivers of societal decision making. These research efforts conceptualize narratives as connected entity chains, and modeling typically focuses on the identification of entities and their connections within a text. An emerging approach to narrative modelling is the use of semantic role labeling (SRL) to extract Entity-Verb-Entity (E-V-Es) tuples from a text, followed by dimensionality reduction to reduce the space of entities and connections separately. This process penalises the semantic richness of narratives and discards much contextual information along the way. Here, we propose an alternate narrative extraction approach - CANarEx, incorporating a pipeline of common contextual constructs through co-reference resolution, micro-narrative generation and clustering of these narratives through sentence embeddings. We evaluate our approach through testing the recovery of \"narrative time-series clusters\", mimicking a desirable text-as-data task. The evaluation framework leverages synthetic data generated using a GPT-3 model. The GPT-3 model is trained to generate similar sentences using a large dataset of news articles. The synthetic data maps to three topics in the news dataset. We then generate narrative time-series document cluster representations by mapping the synthetic data to three distinct signals synthetically injected into the testing corpus. Evaluation results demonstrate the superior ability of CANarEx to recover narrative time-series through reduced MSE and improved precision/recall relative to existing methods. The validity is further reinforced through ablation studies and qualitative analysis.", "track": "NLP Applications", "label": 0}, {"loc": [3.6368181705474854, 8.046760559082031], "id": 2283, "title": "Narrate Dialogues for Better Summarization", "authors": "Ruochen Xu, Chenguang Zhu and Michael Zeng", "abstract": "Dialogue summarization models aim to generate a concise and accurate summary for multi-party dialogue. The complexity of dialogue, including coreference, dialogue acts, and inter-speaker interactions bring unique challenges to dialogue summarization. Most recent neural models achieve state-of-art performance following the pretrain-then-finetune recipe, where the large-scale language model (LLM) is pretrained on large-scale single-speaker written text, but later finetuned on multi-speaker dialogue text. To mitigate the gap between pretraining and finetuning, we propose several approaches to convert the dialogue into a third-person narrative style and show that the narration serves as a valuable annotation for LLMs. Empirical results on three benchmark datasets show our simple approach achieves higher scores on the ROUGE and a factual correctness metric.", "track": "Summarization", "label": 14}, {"loc": [0.7798523902893066, 8.095955848693848], "id": 2306, "title": "CrossRE: A Cross-Domain Dataset for Relation Extraction", "authors": "Elisa Bassignana and Barbara Plank", "abstract": "Relation Extraction (RE) has attracted increasing attention, but current RE evaluation is limited to in-domain evaluation setups. Little is known on how well a RE system fares in challenging, but realistic out-of-distribution evaluation setups. To address this gap, we propose CrossRE, a new, freely-available cross-domain benchmark for RE, which comprises six distinct text domains and includes multi-label annotations. An additional innovation is that we release meta-data collected during annotation, to include explanations and flags of difficult instances. We provide an empirical evaluation with a state-of-the-art model for relation classification. As the meta-data enables us to shed new light on the state-of-the-art model, we provide a comprehensive analysis on the impact of difficult cases and find correlations between model and human annotations. Overall, our empirical investigation highlights the difficulty of cross-domain RE. We release our dataset, to spur more research in this direction.", "track": "Resources and Evaluation", "label": 1}, {"loc": [2.7872440814971924, 7.113026142120361], "id": 2309, "title": "Probing Structural Knowledge from Pre-trained Language Model for Argumentation Relation Classification", "authors": "Yang Sun, Bin Liang, Jianzhu Bao, Min Yang and Ruifeng Xu", "abstract": "Extracting fine-grained structural information between argumentation component (AC) pairs is essential for argumentation relation classification (ARC). However, most previous studies attempt to model the relationship between AC pairs using AC level similarity or semantically relevant features. They ignore the complex interaction between AC pairs and cannot effectively reason the argumentation relation deeply.\nTherefore, in this paper, we propose a novel \\underline{d}ual \\underline{p}rior \\underline{g}raph \\underline{n}eural \\underline{n}etwork (DPGNN) to jointly explore the probing knowledge derived from pre-trained language models (PLMs) and the syntactical information for comprehensively modeling the relationship between AC pairs. Specifically, we construct a probing graph by using probing knowledge derived from PLMs to recognize and align the relational information within and across the argumentation components. In addition, we propose a mutual dependency graph for the AC pair to reason the fine-grained syntactic structural information, in which the syntactical correlation between words is set by the dependency information within AC and mutual attention mechanism across ACs. The knowledge learned from the probing graph and the dependency graph are combined to comprehensively capture the aligned relationships of AC pairs for improving the results of ARC. Experimental results on three public datasets show that DPGNN outperforms the state-of-the-art baselines by a noticeable margin.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [4.577014923095703, 4.359799385070801], "id": 2310, "title": "LogicNMR: Probing the Non-monotonic Reasoning Ability of Pre-trained Language Models", "authors": "Yeliang Xiu, Zhanhao Xiao and Yongmei Liu", "abstract": "The logical reasoning capabilities of pre-trained language models have recently received much attention. As one of the vital reasoning paradigms, non-monotonic reasoning refers to the fact that conclusions may be invalidated with new information. Existing work has constructed a non-monotonic inference dataset $\\delta$-NLI and explored the performance of language models on it. However, the $\\delta$-NLI dataset is entangled with commonsense reasoning. In this paper, we explore the pure non-monotonic reasoning ability of pre-trained language models. \nWe build a non-monotonic reasoning benchmark, named LogicNMR, with explicit default rules and iterative updates. In the experimental part, the performance of popular language models on LogicNMR is explored from the perspectives of accuracy, generalization, proof-based traceability and robustness. The experimental results show that even though the fine-tuned language models achieve an accuracy of more than 94.4\\% on LogicNMR, they perform unsatisfactorily, with a significant drop, in generalization and proof-based traceability.", "track": "Resources and Evaluation", "label": 1}, {"loc": [2.700573444366455, 4.857706546783447], "id": 2313, "title": "Cheater's Bowl: Human vs. Computer Search Strategies for Open-Domain QA", "authors": "Wanrong He, Andrew Mao and Jordan Boyd-Graber", "abstract": "For humans and computers, the first step in answering an open-domain question is retrieving a set of relevant documents from a large corpus. However, the strategies that computers use fundamentally differ from those of humans. To better understand these differences, we design a gamified interface for data collection -- Cheater's Bowl -- where a human answers complex questions with access to both traditional and modern search tools. We collect a dataset of human search sessions, analyze human search strategies and compare them to state-of-the-art multi-hop QA models. We show that humans query logically, apply dynamic search chains and utilize world knowledge to boost searching. We demonstrate how human queries can improve the accuracy of existing systems and propose the future design of QA models.", "track": "Question Answering", "label": 11}, {"loc": [3.9013633728027344, 9.77903938293457], "id": 2337, "title": "FRSUM: Towards Faithful Abstractive Summarization via Enhancing Factual Robustness", "authors": "Wenhao Wu, Wei Li, Jiachen Liu, Xinyan Xiao, Ziqiang Cao, Sujian Li and Hua Wu", "abstract": "Despite being able to generate fluent and grammatical text, current Seq2Seq summarization models still suffering from the unfaithful generation problem.\nIn this paper, we study the faithfulness of existing systems from a new perspective of factual robustness which is the ability to correctly generate factual information over adversarial unfaithful information.\nWe first measure a model's\nfactual robustness by its success rate to defend against adversarial attacks when generating factual information.\nThe factual robustness analysis on a wide range of current systems shows its good consistency with human judgments on faithfulness.\nInspired by these findings, we propose to improve the faithfulness of a model by enhancing its factual robustness.\nSpecifically, we propose a novel training strategy, namely FRSUM, which teaches the model to defend against both explicit adversarial samples and implicit factual adversarial perturbations.\nExtensive automatic and human evaluation results show that FRSUM consistently improves the faithfulness of various Seq2Seq models, such as T5, BART.", "track": "Summarization", "label": 14}, {"loc": [5.103869438171387, 8.885583877563477], "id": 2348, "title": "PoeLM: A Meter- and Rhyme-Controllable Language Model for Unsupervised Poetry Generation", "authors": "Aitor Ormazabal, Mikel Artetxe, Manex Agirrezabal, Aitor Soroa and Eneko Agirre", "abstract": "Formal verse poetry imposes strict constraints on the meter and rhyme scheme of poems. Most prior work on generating this type of poetry uses existing poems for supervision, which are difficult to obtain for most languages and poetic forms. In this work, we propose an unsupervised approach to generate poems that follow any given meter and rhyme scheme, without requiring any poetic text for training. Our method works by splitting a regular, non-poetic corpus into phrases, prepending control codes that describe the length and end rhyme of each phrase, and training a transformer language model in the augmented corpus. The transformer learns to link the structure descriptor with the control codes to the number of lines, their length and their end rhyme. During inference, we build control codes for the desired meter and rhyme scheme, and condition our language model on them to generate formal verse poetry. Experiments in Spanish and Basque show that our approach is able to generate valid poems, which are often comparable in quality to those written by humans.", "track": "Natural Language Generation", "label": 6}, {"loc": [7.970156192779541, 9.71275806427002], "id": 2349, "title": "ProGen: Progressive Zero-shot Dataset Generation via In-context Feedback", "authors": "Jiacheng Ye, Jiahui Gao, Zhiyong Wu, Jiangtao Feng, Tao Yu and Lingpeng Kong", "abstract": "Recently, dataset-generation-based zero-shot learning has shown promising results by training a task-specific model with a dataset synthesized from large pre-trained language models (PLMs). \n The final task-specific model often achieves compatible or even better performance than PLMs under the zero-shot setting, with orders of magnitude fewer parameters.\nHowever, synthetic datasets have their drawbacks. They have long being suffering from the low-quality issue (e.g., low informativeness, redundancy). This explains why the massive synthetic data does not lead to better performance -- a scenario we would expect in the human-labeled data. \nTo improve the quality in dataset synthesis, we propose a progressive zero-shot dataset generation framework, ProGen, which leverages the feedback from the task-specific model to guide the generation of new training data via in-context examples.\nExtensive experiments on five text classification datasets demonstrate the effectiveness of the proposed approach. \nWe also show ProGen achieves on-par or superior performance with only 1% synthetic dataset size, when comparing to baseline methods without in-context feedback.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [4.829037189483643, 7.315104007720947], "id": 2368, "title": "Constructing Highly Inductive Contexts for Dialogue Safety through Controllable Reverse Generation", "authors": "Zhexin Zhang, Jiale Cheng, Hao Sun, Jiawen Deng, Fei Mi, Yasheng Wang, Lifeng Shang and Minlie Huang", "abstract": "Large pretrained language models can easily produce toxic or biased content, which is prohibitive for practical use. In order to detect such toxic generations, existing methods rely on templates, real-world data extraction, crowdsourcing workers or automatic generation to construct adversarial contexts that are likely to induce toxic generations. However, what type of context is more likely to induce unsafe responses is still under-explored. In this paper, we identify that context toxicity and context category (e.g., profanity, insult, drugs, etc.) are two important factors to cause safety issues in response generation. Hence, we propose a method called reverse generation to construct adversarial contexts conditioned on a given response, with the flexibility to control category, toxicity level and inductivity of the generated contexts. Via reverse generation, we augment the existing BAD dataset and construct a new dataset BAD+ which contains more than 120K diverse and highly inductive contexts in 12 categories. We test three popular pretrained dialogue models (Blender, DialoGPT and Plato2) and find that BAD+ can largely expose their safety problems. Furthermore, we show that BAD+ can greatly enhance the safety of generation, and we reveal the key factors of safety improvement. Our code and dataset is available at \\url{https://github.com/thu-coai/Reverse_Generation}.", "track": "Ethic Concerns:Dialogue and Interactive Systems", "label": 4}, {"loc": [3.373538017272949, 4.544143199920654], "id": 2375, "title": "Language Prior Is Not the Only Shortcut: A Benchmark for Shortcut Learning in VQA", "authors": "Qingyi Si, Fandong Meng, Mingyu Zheng, Zheng Lin, Yuanxin LIU, Peng Fu, Yanan Cao, Weiping Wang and Jie Zhou", "abstract": "Visual Question Answering (VQA) models are prone to learn the shortcut solution formed by dataset biases rather than the intended solution. To evaluate the VQA models' reasoning ability beyond shortcut learning, the VQA-CP v2 dataset introduces a distribution shift between the training and test set given a question type. In this way, the model cannot use the training set shortcut (from question type to answer) to perform well on the test set. However, VQA-CP v2 only considers one type of shortcut and thus still cannot guarantee that the model relies on the intended solution rather than a solution specific to this shortcut. To overcome this limitation, we propose a new dataset that considers varying types of shortcuts by constructing different distribution shifts in multiple OOD test sets. In addition, we overcome the three troubling practices in the use of VQA-CP v2, e.g., selecting models using OOD test sets, and further standardize OOD evaluation procedure. Our benchmark provides a more rigorous and comprehensive testbed for shortcut learning in VQA. We benchmark recent methods and find that methods specifically designed for particular shortcuts fail to simultaneously generalize to our varying OOD test sets. We also systematically study the varying shortcuts and provide several valuable findings, which may promote the exploration of shortcut learning in VQA.", "track": "Question Answering", "label": 11}, {"loc": [1.8603405952453613, 4.00204610824585], "id": 2382, "title": "Bridging the Training-Inference Gap for Dense Phrase Retrieval", "authors": "Gyuwan Kim, Jinhyuk Lee, Barlas Oguz, Wenhan Xiong, Yizhe Zhang, Yashar Mehdad and William Yang Wang", "abstract": "Building dense retrievers requires a series of standard procedures, including training and validating neural models and creating indexes for efficient search. However, these procedures are often misaligned in that training objectives do not exactly reflect the retrieval scenario at inference time. In this paper, we explore how the gap between training and inference in dense retrieval can be reduced, focusing on dense phrase retrieval (Lee et al., 2021) where billions of representations are indexed at inference. Since validating every dense retriever with a large-scale index is practically infeasible, we propose an efficient way of validating dense retrievers using a small subset of the entire corpus. This allows us to validate various training strategies including unifying contrastive loss terms and using hard negatives for phrase retrieval, which largely reduces the training-inference discrepancy. As a result, we improve top-1 phrase retrieval accuracy by 2~3 points and top-20 passage retrieval accuracy by 2~4 points for open-domain question answering. Our work urges modeling dense retrievers with careful consideration of training and inference via efficient validation while advancing phrase retrieval as a general solution for dense retrieval.", "track": "Question Answering", "label": 11}, {"loc": [8.619942665100098, 6.743721961975098], "id": 2396, "title": "Beyond Counting Datasets: A Survey of Multilingual Dataset Construction and Necessary Resources", "authors": "Xinyan Yu, Trina Chatterjee, Akari Asai, Junjie Hu and Eunsol Choi", "abstract": "While the NLP community is generally aware of resource disparities among languages, we lack research that quantifies the extent and types of such disparity. Prior surveys estimating the availability of resources based on the number of datasets can be misleading as dataset quality varies: many datasets are automatically induced or translated from English data. To provide a more comprehensive picture of language resources, we examine the characteristics of 156 publicly available NLP datasets. We manually annotate how they are created, including input text and label sources and tools used to build them, and what they study, tasks they address and motivations for their creation. After quantifying the qualitative NLP resource gap across languages, we discuss how to improve data collection in low-resource languages. We survey language-proficient NLP researchers and crowd workers per language, finding that their estimated availability correlates with dataset availability. Through crowdsourcing experiments, we identify strategies for collecting high-quality multilingual data on the Mechanical Turk platform. We conclude by making macro and micro-level suggestions to the NLP community and individual researchers for future multilingual data development.", "track": "Theme Track", "label": 18}, {"loc": [5.734128475189209, 11.871498107910156], "id": 2403, "title": "ERNIE-Layout: Layout Knowledge Enhanced Pre-training for Visually-rich Document Understanding", "authors": "Qiming Peng, Yinxu Pan, Wenjin Wang, Bin Luo, Zhenyu Zhang, Zhengjie Huang, Yuhui Cao, Weichong Yin, Yongfeng Chen, Yin Zhang, Shikun Feng, Yu Sun, Hao Tian, Hua Wu and Haifeng Wang", "abstract": "Recent years have witnessed the rise and success of pre-training techniques in visually-rich document understanding. However, most existing methods lack the systematic mining and utilization of layout-centered knowledge, leading to sub-optimal performances. In this paper, we propose ERNIE-Layout, a novel document pre-training solution with layout knowledge enhancement in the whole workflow, to learn better representations that combine the features from text, layout, and image. Specifically, we first rearrange input sequences in the serialization stage, and then present a correlative pre-training task, reading order prediction, to learn the proper reading order of documents. To improve the layout awareness of the model, we integrate a spatial-aware disentangled attention into the multi-modal transformer and a replaced regions prediction task into the pre-training phase. Experimental results show that ERNIE-Layout achieves superior performance on various downstream tasks, setting new state-of-the-art on key information extraction, document image classification, and document question answering datasets. The code and models are publicly available at PaddleNLP.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [4.903752326965332, 4.743311405181885], "id": 2428, "title": "Do Charge Prediction Models Learn Legal Theory?", "authors": "An Zhenwei, Quzhe Huang, Cong Jiang, Yansong Feng and Dongyan Zhao", "abstract": "The charge prediction task aims to predict the charge for a case given its fact description. Recent models have already achieved impressive accuracy in this task, however, little is understood about the mechanisms they use to perform the judgment.\nFor practical applications, a charge prediction model should conform to the certain legal theory in civil law countries, as under the framework of civil law, all cases are judged according to certain local legal theories. In China, for example, nearly all criminal judges make decisions based on the Four Elements Theory (FET).\nIn this paper, we argue that trustworthy charge prediction models should take legal theories into consideration, and standing on prior studies in model interpretation, we propose three principles for trustworthy models should follow in this task, which are \\texttt {sensitive}, \\texttt{selective}, and \\texttt{presumption of innocence}.\nWe further design a new framework to evaluate whether existing charge prediction models learn legal theories. Our findings indicate that, while existing charge prediction models meet the \\texttt{selective} principle on a benchmark dataset, most of them are still not \\texttt{sensitive} enough and do not satisfy the \\texttt{presumption of innocence}. Our code and dataset are released at \\url{https://github.com/ZhenweiAn/EXP_LJP}.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [4.286746978759766, 7.514503479003906], "id": 2445, "title": "Keep Me Updated! Memory Management in Long-term Conversations", "authors": "Sanghwan Bae, Donghyun Kwak, Soyoung Kang, Min Young Lee, Sungdong Kim, Yuin Jeong, Hyeri Kim, Sang-Woo Lee, Woomyoung Park and Nako Sung", "abstract": "Remembering important information from the past and continuing to talk about it in the present are crucial in long-term conversations. However, previous literature does not deal with cases where the memorized information is outdated, which may cause confusion in later conversations. To address this issue, we present a novel task and a corresponding dataset of memory management in long-term conversations, in which bots keep track of and bring up the latest information about users while conversing through multiple sessions. In order to support more precise and interpretable memory, we represent memory as unstructured text descriptions of key information and propose a new mechanism of memory management that selectively eliminates invalidated or redundant information. Experimental results show that our approach outperforms the baselines that leave the stored memory unchanged in terms of engagingness and humanness, with larger performance gap especially in the later sessions.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [4.509362697601318, 7.649197101593018], "id": 2456, "title": "A Unified Dialogue User Simulator for Few-shot Data Augmentation", "authors": "Dazhen Wan, Zheng Zhang, Qi Zhu, Lizi Liao and Minlie Huang", "abstract": "Pre-trained language models have shown superior performance in task-oriented dialogues. However, existing datasets are on limited scales, which cannot support large-scale pre-training. Fortunately, various data augmentation methods have been developed to augment large-scale task-oriented dialogue corpora. However, they heavily rely on annotated data in the target domain, which require a tremendous amount of data collection and human labeling work. In this paper, we build a unified dialogue user simulation model by pre-training on several publicly available datasets. The model can then be tuned on a target domain with few-shot data. The experiments on a target dataset across multiple domains show that our proposed model brings remarkable performance increases through data augmentation.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [7.613858699798584, 12.31232738494873], "id": 2469, "title": "Human-in-the-Loop Hate Speech Classification in a Multilingual Context", "authors": "Ana Kotarcic, Dominik Hangartner, Fabrizio Gilardi, Selina Kurer and Karsten Donnay", "abstract": "The shift of public debate to the digital sphere has been accompanied by a rise in online hate speech. While many promising approaches for hate speech classification have been proposed, studies often focus only on a single language, usually English, and do not address three key concerns: post-deployment performance, classifier maintenance and infrastructural limitations. In this paper, we introduce a new human-in-the-loop BERT-based hate speech classification pipeline and trace its development from initial data collection and annotation all the way to post-deployment. Our classifier, trained using data from our original corpus of over 422k examples, is specifically developed for the inherently multilingual setting of Switzerland and outperforms with its F1 score of 80.5 the currently best-performing BERT-based multilingual classifier by 5.8 F1 points in German and 3.6 F1 points in French. Our systematic evaluations over a 12-month period further highlight the vital importance of continuous, human-in-the-loop classifier maintenance to ensure robust hate speech classification post-deployment.", "track": "NLP Applications", "label": 0}, {"loc": [10.828190803527832, 9.34191608428955], "id": 2472, "title": "An Error-Guided Correction Model for Chinese Spelling Error Correction", "authors": "Rui Sun, Xiuyu Wu and Yunfang Wu", "abstract": "Although existing neural network approaches have achieved great progress on Chinese spelling correction, there is still room to improve. The model is required to avoid over-correction and to distinguish a correct token from its phonological and visual similar ones. In this paper, we propose an error-guided correction model to address these issues. By borrowing the powerful ability of the pre-trained BERT model, we propose a novel zero-shot error detection method to do a preliminary detection, which guides our model to attend more on the probably wrong tokens in encoding and to avoid modifying the correct tokens in generating. Furthermore, we introduce a new loss function to integrate the error confusion set, which enables our model to distinguish similar tokens. Moreover, our model supports highly parallel decoding to meet real applications. Experiments are conducted on widely used benchmarks. Our model achieves superior performance against state-of-the-art approaches by a remarkable margin, on both the quality and computation speed.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.474376678466797, 12.463579177856445], "id": 2481, "title": "Describing Sets of Images with Textual-PCA", "authors": "Oded Hupert, Idan Schwartz and Lior Wolf", "abstract": "We seek to semantically describe a set of images, capturing both the attributes of single images and the variations within the set. Our procedure is analogous to Principle Component Analysis, in which the role of projection vectors is replaced with generated phrases. First, a centroid phrase that has the largest average semantic similarity to the images in the set is generated, where both the computation of the similarity and the generation are based on pretrained vision-language models. Then, the phrase that generates the highest variation among the similarity scores is generated, using the same models. The next phrase maximizes the variance subject to being orthogonal, in the latent space, to the highest-variance phrase, and the process continues. Our experiments show that our method is able to convincingly capture the essence of image sets and describe the individual elements in a semantically meaningful way within the context of the entire set. Our code is available at: \\url{https://github.com/OdedH/textual-pca}.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.723360061645508, 8.940983772277832], "id": 2484, "title": "Learning to Model Editing Processes", "authors": "Machel Reid and Graham Neubig", "abstract": "Most existing sequence generation models produce outputs in one pass, usually left-to-right. However, this is in contrast with a more natural approach that humans use in generating content; iterative refinement and editing. Recent work has introduced edit-based models for various tasks (such as neural machine translation and text style transfer), but these generally model a single edit step. In this work, we propose modeling editing processes, modeling the whole process of iteratively generating sequences. We form a conceptual framework to describe the likelihood of multi-step edits, and describe neural models that can learn a generative model of sequences based on these multistep edits. We introduce baseline results and metrics on this task, finding that modeling editing processes improves performance on a variety of axes on both our proposed task and related downstream tasks compared to previous single-step models of edits.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [0.7180464267730713, 6.795099258422852], "id": 2487, "title": "PALT: Parameter-Lite Transfer of Language Models for Knowledge Graph Completion", "authors": "Jianhao Shen, Chenguang Wang, Ye Yuan, Jiawei Han, Heng Ji, Koushik Sen, Ming Zhang and Dawn Song", "abstract": "This paper presents a parameter-lite transfer learning approach of pretrained language models (LM) for knowledge graph (KG) completion. Instead of finetuning, which modifies all LM parameters, we only tune a few new parameters while keeping the original LM parameters fixed. We establish this via reformulating KG completion as a \"fill-in-the-blank'' task, and introducing a parameter-lite encoder on top of the original LMs. We show that, by tuning far fewer parameters than finetuning, LMs transfer non-trivially to most tasks and reach competitiveness with prior state-of-the-art approaches. For instance, we outperform the fully finetuning approaches on a KG completion benchmark by tuning only 1% of the parameters.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [3.756138563156128, 5.894388198852539], "id": 2489, "title": "Prompt-based Connective Prediction Method for Fine-grained Implicit Discourse Relation Recognition", "authors": "Hao Zhou, Man Lan, Yuanbin Wu, Yuefeng Chen and Meirong Ma", "abstract": "Due to the absence of connectives, implicit discourse relation recognition (IDRR) is still a challenging and crucial task in discourse analysis. Most of the current work adopted multitask learning to aid IDRR through explicit discourse relation recognition (EDRR) or utilized dependencies between discourse relation labels to constrain model predictions. But these methods still performed poorly on fine-grained IDRR and even utterly misidentified on most of the few-shot discourse relation classes. To address these problems, we propose a novel Prompt-based Connective Prediction (PCP) method for IDRR. Our method instructs large-scale pre-trained models to use knowledge relevant to discourse relation and utilizes the strong correlation between connectives and discourse relation to help the model recognize implicit discourse relations. Experimental results show that our method surpasses the current state-of-the-art model and achieves significant improvements on those fine-grained few-shot discourse relation. Moreover, our approach is able to be transferred to EDRR and obtain acceptable results. Our code is released in https://github.com/zh-i9/PCP-for-IDRR.", "track": "Discourse and Pragmatics", "label": 24}, {"loc": [9.656222343444824, 6.321267127990723], "id": 2514, "title": "On Utilizing Constituent Language Resources to Improve Downstream Tasks in Hinglish", "authors": "Vishwajeet Kumar, Rudra Murthy and Tejas Dhamecha", "abstract": "Performance of downstream NLP tasks on code-switched Hindi-English (aka \\hgl) continues to remain a significant challenge. Intuitively, Hindi and English corpora should aid improve task performance on Hinglish. We show that meta-learning framework can effectively utilize the the labelled resources of the downstream tasks in the constituent\\footnote{We use the term \\textit{constituent} to jointly refer to the matrix and embedding languages.} languages. The proposed approach improves the performance on downstream tasks on code-switched language. We experiment with \\hgl code-switching benchmark GLUECoS and report significant improvements.", "track": "Multilinguality", "label": 13}, {"loc": [1.7021663188934326, 5.389439105987549], "id": 2532, "title": "SYGMA: A System for Generalizable and Modular Question Answering Over Knowledge Bases", "authors": "Sumit Neelam, Udit Sharma, Hima Karanam, Shajith Ikbal, Pavan Kapanipathi, Ibrahim Abdelaziz, Nandana Mihindukulasooriya, Young-Suk Lee, Santosh Srivastava, Cezar Pendus, Saswati Dana, Dinesh Garg, Achille Fokoue, G P Shrivatsa Bhargav, Dinesh Khandelwal, Srinivas Ravishankar, Sairam Gurajada, Maria Chang, Rosario Uceda-Sosa, Salim Roukos, Alexander Gray, Guilherme Lima, Ryan Riegel, Francois Luus and L V Subramaniam", "abstract": "Knowledge Base Question Answering (KBQA) involving complex reasoning is emerging as an important research direction. However, most KBQA systems struggle with generalizability, particularly on two dimensions: (a) across multiple knowledge bases, where existing KBQA approaches are typically tuned to a single knowledge base, and (b) across multiple reasoning types, where majority of datasets and systems have primarily focused on multi-hop reasoning. In this paper, we present SYGMA, a modular KBQA approach developed with goal of generalization across multiple knowledge bases and multiple reasoning types. To facilitate this, SYGMA is designed as two high level modules: 1) KB-agnostic question understanding module that remain common across KBs, and generates logic representation of the question with high level reasoning constructs that are extensible, and 2) KB-specific question mapping and answering module to address the KB-specific aspects of the answer extraction. We evaluated SYGMA on multiple datasets belonging to distinct knowledge bases (DBpedia and Wikidata) and distinct reasoning types (multi-hop and temporal). State-of-the-art or competitive performances achieved on those datasets demonstrate its generalization capability.", "track": "Question Answering", "label": 11}, {"loc": [8.055648803710938, 9.392278671264648], "id": 2546, "title": "Instance-Guided Prompt Learning for Few-Shot Text Matching", "authors": "Jia Du, Xuanyu Zhang, siyi wang, Kai Wang, Yanquan Zhou, Lei Li, Qing Yang and Dongliang Xu", "abstract": "Few-shot text matching is a more practical technique in natural language processing (NLP) to determine whether two texts are semantically identical. They primarily design patterns to reformulate text matching into a pre-trained task with uniform prompts across all instances. But they fail to take into account the connection between prompts and instances. This paper argues that dynamically strengthening the correlation between particular instances and the prompts is necessary because fixed prompts cannot adequately fit all diverse instances in inference. We suggest IGATE: Instance-Guided prompt leArning for few-shoT tExt matching, a novel pluggable prompt learning method. The gate mechanism used by IGATE, which is between the embedding and the PLM encoders, makes use of the semantics of instances to regulate the effects of the gate on the prompt tokens. The experimental findings show that IGATE achieves SOTA performance on MRPC and QQP, outperforming strong baselines. GitHub will host the release of codes.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [3.308990001678467, 9.415026664733887], "id": 2549, "title": "M3: Multi-level dataset for Multi-document summarisation of Medical studies", "authors": "Yulia Otmakhova, Karin Verspoor, Timothy Baldwin, Antonio Jimeno Yepes and Jey Han Lau", "abstract": "We present M3 (Multi-level dataset for Multi-document summarisation of Medical studies), a benchmark dataset for evaluating the quality of summarisation systems in the biomedical domain. The dataset contains sets of multiple input documents and target summaries of three levels of complexity: documents, sentences, and propositions. The dataset also includes several levels of annotation, including biomedical entities, direction, and strength of relations between them, and the discourse relationships between the input documents (\"contradiction'' or \"agreement''). We showcase usage scenarios of the dataset by testing 10 generic and domain-specific summarisation models in a zero-shot setting, and introduce a probing task based on counterfactuals to test if models are aware of the direction and strength of the conclusions generated from input studies.", "track": "Summarization", "label": 14}, {"loc": [0.7095517516136169, 6.945507526397705], "id": 2565, "title": "A Knowledge-Enhanced Multilingual Language Model for Both Knowledge Graph and Language Tasks", "authors": "Yifan Hou, Wenxiang Jiao, Meizhen Liu, Carl Allen, Zhaopeng Tu and Mrinmaya Sachan", "abstract": "Large language models (LMs) appear to learn facts from the large text corpora they are trained on. Such facts are encoded implicitly within their many parameters, making it difficult to verify or manipulate what knowledge has been learned. LMs have recently been extended to multilingual LMs (MLLMs), enabling knowledge to be learned across hundreds of languages. Meanwhile, knowledge graphs (KGs) contain facts in an explicit triple format, which require careful and costly curation and are only available in a few high-resource languages, restricting their research and application. To address this, we propose to enhance MLLMs with the knowledge of multilingual knowledge graphs (MLKGs) so as to tackle language and KG tasks across many languages, including low-resource ones. Specifically, we introduce a lightweight adapter set to enhance MLLMs with cross-lingual entity alignment and facts from MLKGs for many languages. Experiments on common benchmarks show that such enhancement benefits both the MLLM and MLKG, achieving: (1) comparable or improved performance for KG completion and entity alignment relative to baselines, especially for low-resource languages (for which KGs are unavailable); and (2) improved MLLM performance on language understanding tasks that require multilingual factual knowledge; all while maintaining performance on other general language tasks.", "track": "Multilinguality", "label": 13}, {"loc": [7.316677093505859, 9.588396072387695], "id": 2568, "title": "SepLL: Separating Latent Class Labels from Weak Supervision Noise", "authors": "Andreas Stephan, Vasiliki Kougia and Benjamin Roth", "abstract": "In the weakly supervised learning paradigm, labeling functions automatically assign heuristic, often noisy, labels to data samples. In this work, we provide a method for learning from weak labels by separating two types of complementary information associated with the labeling functions: information related to the target label and information specific to one labeling function only. Both types of information are reflected to different degrees by all labeled instances. In contrast to previous works that aimed at correcting or removing wrongly labeled instances, we learn a branched deep model that uses all data as-is, but splits the labeling function information in the latent space. Specifically, we propose the end-to-end model SepLL which extends a transformer classifier by introducing a latent space for labeling function specific and task-specific information. The learning signal is only given by the labeling functions matches, no pre-processing or label model is required for our method. Notably, the task prediction is made from the latent layer without any direct task signal. Experiments on Wrench text classification tasks show that our model is competitive with the state-of-the-art, and yields a new best average performance.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [4.786653518676758, 3.919642448425293], "id": 2573, "title": "Probing Relational Knowledge in Language Models via Word Analogies", "authors": "Kiamehr Rezaee and Jose Camacho-Collados", "abstract": "Understanding relational knowledge plays an integral part in natural language comprehension. When it comes to pre-trained language models (PLM), prior work has been focusing on probing relational knowledge this by filling the blanks in pre-defined prompts such as \"The capital of France is ---\". However, these probes may be affected by the co-occurrence of target relation words and entities (e.g. \"capital\", \"France\" and \"Paris\") in the pre-training corpus. In this work, we extend these probing methodologies leveraging analogical proportions as a proxy to probe relational knowledge in transformer-based PLMs without directly presenting the desired relation. In particular, we analysed the ability of PLMs to understand (1) the directionality of a given relation (e.g. Paris-France is not the same as France-Paris); (2) the ability to distinguish types on a given relation (both France and Japan are countries); and (3) the relation itself (Paris is the capital of France, but not Rome). Our results show how PLMs are extremely accurate at (1) and (2), but have clear room for improvement for (3). To better understand the reasons behind this behaviour and mistakes made by PLMs, we provide an extended quantitative analysis based on relevant factors such as frequency.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [8.153226852416992, 8.739192962646484], "id": 2575, "title": "Semi-Supervised Lifelong Language Learning", "authors": "Yingxiu Zhao, Yinhe Zheng, Bowen Yu, Zhiliang Tian, Dongkyu Lee, Jian Sun, Yongbin Li and Nevin L. Zhang", "abstract": "Lifelong learning aims to accumulate knowledge and alleviate catastrophic forgetting when learning tasks sequentially. However, existing lifelong language learning methods only focus on the supervised learning setting. Unlabeled data, which can be easily accessed in real-world scenarios, are underexplored. In this paper, we explore a novel setting, semi-supervised lifelong language learning (SSLL), where a model learns sequentially arriving language tasks with both labeled and unlabeled data. We propose an unlabeled data enhanced lifelong learner to explore SSLL. Specially, we dedicate task-specific modules to alleviate catastrophic forgetting and design two modules to exploit unlabeled data: (1) a virtual supervision enhanced task solver is constructed on a teacher-student framework to mine the underlying knowledge from unlabeled data; and (2) a backward augmented learner is built to encourage knowledge transfer from newly arrived unlabeled data to previous tasks. Experimental results on various language tasks demonstrate our model's effectiveness and superiority over competitive baselines under the new setting SSLL.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [8.054038047790527, 9.469047546386719], "id": 2590, "title": "Parameter-free Automatically Prompting: A Latent Pseudo Label Mapping Model for Prompt-based Learning", "authors": "Jirui Qi, Richong Zhang, Junfan Chen, Jaein Kim and Yongyi Mao", "abstract": "Prompt-based learning has achieved excellent performance in few-shot learning by mapping the outputs of the pre-trained language model to the labels with the help of a label mapping component. Existing manual label mapping (MLM) methods achieve good results but heavily rely on expensive human knowledge. Automatic label mapping (ALM) methods that learn the mapping functions with extra parameters have shown their potentiality. However, no effective ALM model comparable to MLM methods is developed yet due to the limited data. In this paper, we propose a Latent Pseudo Label Mapping (LPLM) method that optimizes the label mapping without human knowledge and extra parameters. LPLM is built upon a probabilistic latent model and is iteratively self-improved with the EM-style algorithm. The empirical results demonstrate that our LPLM method is superior to the mainstream ALM methods and significantly outperforms the SOTA method in few-shot classification tasks. Moreover, LPLM also shows impressively better performance than the vanilla MLM method which requires extra task-specific prior knowledge.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [1.0461918115615845, 10.521834373474121], "id": 2592, "title": "Exploring Logographic Image for Chinese Aspect-based Sentiment Classification", "authors": "Xiabing Zhou, Renjie Feng, Xiaotong Jiang and Zhongqing Wang", "abstract": "In logographic languages like Chinese, word meanings are constructed using specific character formations, which can help to disambiguate word senses and are beneficial for sentiment classification. \nHowever, such knowledge is rarely explored in previous sentiment analysis methods. In this paper, we focus on exploring the logographic information for aspect-based sentiment classification in Chinese text. Specifically, we employ a logographic image to capture an internal morphological structure from the character sequence. The logographic image is also used to learn the external relations among context and aspect words. Furthermore, we propose a multimodal language model to explicitly incorporate a logographic image with review text for aspect-based sentiment classification in Chinese. Experimental results show that our method brings substantial performance improvement over strong baselines. The results also indicate that the logographic image is very important for exploring the internal structure and external relations from the character sequence.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [8.763411521911621, 7.828667640686035], "id": 2596, "title": "On the Role of Bidirectionality in Language Model Pre-Training", "authors": "Mikel Artetxe, Jingfei Du, Naman Goyal, Luke Zettlemoyer and Veselin Stoyanov", "abstract": "Prior work on language model pre-training has explored different architectures and learning objectives, but differences in data, hyperparameters and evaluation make a principled comparison difficult. In this work, we focus on bidirectionality as a key factor that differentiates existing approaches, and present a comprehensive study of its role in next token prediction, text infilling, zero-shot priming and fine-tuning. We propose a new framework that generalizes prior approaches, including fully unidirectional models like GPT, fully bidirectional models like BERT, and hybrid models like CM3 and prefix LM. Our framework distinguishes between two notions of bidirectionality (bidirectional context and bidirectional attention) and allows us to control each of them separately. We find that the optimal configuration is largely application-dependent (e.g., bidirectional attention is beneficial for fine-tuning and infilling, but harmful for next token prediction and zero-shot priming). We train models with up to 6.7B parameters, and find differences to remain consistent at scale. While prior work on scaling has focused on left-to-right autoregressive models, our results suggest that this approach comes with some trade-offs, and it might be worthwhile to develop very large bidirectional models.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [5.694046974182129, 6.057868957519531], "id": 2614, "title": "You Are What You Talk About: Inducing Evaluative Topics for Personality Analysis", "authors": "Josip Juki\u0107, Iva Vukojevi\u0107 and Jan Snajder", "abstract": "Expressing attitude or stance toward entities and concepts is an integral part of human behavior and personality. Recently, evaluative language data has become more accessible with social media's rapid growth, enabling large-scale opinion analysis. However, surprisingly little research examines the relationship between personality and evaluative language. To bridge this gap, we introduce the notion of evaluative topics, obtained by applying topic models to pre-filtered evaluative text from social media. We then link evaluative topics to individual text authors to build their evaluative profiles. We apply evaluative profiling to Reddit comments labeled with personality scores and conduct an exploratory study on the relationship between evaluative topics and Big Five personality facets, aiming for a more interpretable, facet-level analysis. Finally, we validate our approach by observing correlations consistent with prior research in personality psychology.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [6.4767961502075195, 1.9007999897003174], "id": 2622, "title": "CAT-probing: A Metric-based Approach to Interpret How Pre-trained Models for Programming Language Attend Code Structure", "authors": "Nuo Chen, Qiushi Sun, Renyu Zhu, Xiang Li, Xuesong Lu and Ming Gao", "abstract": "Code pre-trained models (CodePTMs) have recently demonstrated significant success in code intelligence. To interpret these models, some probing methods have been applied. However, these methods fail to consider the inherent characteristics of codes. In this paper, to address the problem, we propose a novel probing method CAT-probing to quantitatively interpret how CodePTMs attend code structure. We first denoise the input code sequences based on the token types pre-defined by the compilers to filter those tokens whose attention scores are too small. After that, we define a new metric CAT-score to measure the commonality between the token-level attention scores generated in CodePTMs and the pair-wise distances between corresponding AST nodes. The higher the CAT-score, the stronger the ability of CodePTMs to capture code structure. We conduct extensive experiments to integrate CAT-probing with representative CodePTMs for different programming languages. Experimental results show the effectiveness of CAT-probing in CodePTM interpretation. Our codes and data are publicly available at https://github.com/nchen909/CodeAttention.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [3.1843578815460205, 9.340736389160156], "id": 2623, "title": "Learning to Revise References for Faithful Summarization", "authors": "Griffin Adams, Han-Chin Shing, Qing Sun, Christopher Winestock, Kathleen McKeown and No\u00e9mie Elhadad", "abstract": "In real-world scenarios with naturally occurring datasets, reference summaries are noisy and may contain information that cannot be inferred from the source text. On large news corpora, removing low quality samples has been shown to reduce model hallucinations. Yet, for smaller, and/or noisier corpora, filtering is detrimental to performance. To improve reference quality while retaining all data, we propose a new approach: to selectively re-write unsupported reference sentences to better reflect source data. We automatically generate a synthetic dataset of positive and negative revisions by corrupting supported sentences and learn to revise reference sentences with contrastive learning. The intensity of revisions is treated as a controllable attribute so that, at inference, diverse candidates can be over-generated-then-rescored to balance faithfulness and abstraction. To test our methods, we extract noisy references from publicly available MIMIC-III discharge summaries for the task of hospital-course summarization, and vary the data on which models are trained. According to metrics and human evaluation, models trained on revised clinical references are much more faithful, informative, and fluent than models trained on original or filtered data.", "track": "Summarization", "label": 14}, {"loc": [5.655788421630859, 6.289561748504639], "id": 2627, "title": "Towards Intention Understanding in Suicidal Risk Assessment with Natural Language Processing", "authors": "Shaoxiong Ji", "abstract": "Recent applications of natural language processing techniques to suicidal ideation detection and risk assessment frame the detection or assessment task as a text classification problem. \nRecent advances have developed many models, especially deep learning models, to boost predictive performance.\nThough the performance (in terms of aggregated evaluation scores) is improving, this position paper urges that better intention understanding is required for reliable suicidal risk assessment with computational methods. \nThis paper reflects the state of natural language processing applied to suicide-associated text classification tasks, differentiates suicidal risk assessment and intention understanding, and points out potential limitations of sentiment features and pretrained language models in suicidal intention understanding.\nBesides, it urges the necessity for sequential intention understanding and risk assessment, discusses some critical issues in evaluation such as uncertainty, and studies the lack of benchmarks.", "track": "Theme Track", "label": 18}, {"loc": [5.494220733642578, 5.096099853515625], "id": 2631, "title": "On the Impact of Temporal Concept Drift on Model Explanations", "authors": "Zhixue Zhao, George Chrysostomou, Kalina Bontcheva and Nikolaos Aletras", "abstract": "Explanation faithfulness of model predictions in natural language processing is typically evaluated on held-out data from the same temporal distribution as the training data (i.e. synchronous settings). While model performance often deteriorates due to temporal variation (i.e. temporal concept drift), it is currently unknown how explanation faithfulness is impacted when the time span of the target data is different from the data used to train the model (i.e. asynchronous settings). For this purpose, we examine the impact of temporal variation on model explanations extracted by eight feature attribution methods and three select-then-predict models across six text classification tasks. Our experiments show that (i) faithfulness is not consistent under temporal variations across feature attribution methods (e.g. it decreases or increases depending on the method), with an attention-based method demonstrating the most robust faithfulness scores across datasets; and (ii) select-then-predict models are mostly robust in asynchronous settings with only small degradation in predictive performance. Finally, feature attribution methods show conflicting behavior when used in FRESH (i.e. a select-and-predict model) and for measuring sufficiency/comprehensiveness (i.e. as post-hoc methods), suggesting that we need more robust metrics to evaluate post-hoc explanation faithfulness. Code will be made publicly available.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.0448689460754395, 12.501559257507324], "id": 2668, "title": "Text-Only Training for Image Captioning using Noise-Injected CLIP", "authors": "David Nukrai, Ron Mokady and Amir Globerson", "abstract": "We consider the task of image-captioning using only the CLIP model and additional text data at training time and no additional captioned images. Our approach relies on the fact that CLIP is trained to make visual and textual embeddings similar. Therefore, we only need to learn how to translate CLIP textual embeddings back into text, and we can learn how to do this by learning a decoder for the frozen CLIP text encoder using only text. We argue that this intuition is \"almost correct\" because of a gap between the embedding spaces, and propose to rectify this via noise injection during training. We demonstrate the effectiveness of our approach by showing SOTA zero-shot image captioning across four benchmarks, including style transfer. \nCode, data, and models are available at https://github.com/DavidHuji/CapDec.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [8.737630844116211, 8.278707504272461], "id": 2678, "title": "Improving Sharpness-Aware Minimization with Fisher Mask for Better Generalization on Language Models", "authors": "Qihuang Zhong, Liang Ding, Li Shen, Peng Mi, Juhua Liu, Bo Du and Dacheng Tao", "abstract": "Fine-tuning large pretrained language models on a limited training corpus usually suffers from poor generalization. Prior works show that the recently-proposed sharpness-aware minimization (SAM) optimization method can improve the model generalization. However, SAM adds a perturbation to each model parameter equally (but not all parameters contribute equally to the optimization of training), which we argue is sub-optimal and will lead to excessive computation. In this paper, we propose a novel optimization procedure, namely FSAM, which introduces a Fisher mask to improve the efficiency and performance of SAM. In short, instead of adding perturbation to all parameters, FSAM uses the Fisher information to identity the important parameters and formulates a Fisher mask to obtain the sparse perturbation, i.e., making the optimizer focus on these important parameters. Experiments on various tasks in GLUE and SuperGLUE benchmarks show that FSAM consistently outperforms the vanilla SAM by 0.67~1.98 average score among four different pretrained models. We also empirically show that FSAM works well in other complex scenarios, e.g., fine-tuning on generation tasks or limited training data. Encouragingly, when training data is limited, FSAM improves the SAM by a large margin, i.e., up to 15.1.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.583014488220215, 4.497382164001465], "id": 2680, "title": "TINA: Textual Inference with Negation Augmentation", "authors": "Chadi Helwe, Simon Coumes, Chlo\u00e9 Clavel and Fabian Suchanek", "abstract": "Transformer-based language models achieve state-of-the-art results on several natural language processing tasks. One of these is textual entailment, i.e., the task of determining whether a premise logically entails a hypothesis. However, the models perform poorly on this task when the examples contain negations. In this paper, we propose a new definition of textual entailment that captures also negation. This allows us to develop TINA (Textual Inference with Negation Augmentation), a principled technique for negated data augmentation that can be combined with the unlikelihood loss function.\nOur experiments with different transformer-based models show that our method can significantly improve the performance of the models on textual entailment datasets with negation -- without sacrificing performance on datasets without negation.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [9.31785774230957, 6.230337142944336], "id": 2685, "title": "Improving Bilingual Lexicon Induction with Cross-Encoder Reranking", "authors": "Yaoyiran Li, Fangyu Liu, Ivan Vuli\u0107 and Anna Korhonen", "abstract": "Bilingual lexicon induction (BLI) with limited bilingual supervision is a crucial yet challenging task in multilingual NLP. Current state-of-the-art BLI methods rely on the induction of cross-lingual word embeddings (CLWEs) to capture cross-lingual word similarities; such CLWEs are obtained 1) via traditional static models (e.g., VecMap), or 2) by extracting type-level CLWEs from multilingual pretrained language models (mPLMs), or 3) through combining the former two options. In this work, we propose a novel semi-supervised post-hoc reranking method termed BLICEr (BLI with Cross-Encoder Reranking), applicable to any precalculated CLWE space, which improves their BLI capability. The key idea is to 'extract' cross-lingual lexical knowledge from mPLMs, and then combine it with the original CLWEs. This crucial step is done via 1) creating a word similarity dataset, comprising positive word pairs (i.e., true translations) and hard negative pairs induced from the original CLWE space, and then 2) fine-tuning an mPLM (e.g., mBERT or XLM-R) in a cross-encoder manner to predict the similarity scores. At inference, we 3) combine the similarity score from the original CLWE space with the score from the BLI-tuned cross-encoder. BLICEr establishes new state-of-the-art results on two standard BLI benchmarks spanning a wide spectrum of diverse languages: it substantially outperforms a series of strong baselines across the board. We also validate the robustness of BLICEr with different CLWEs.", "track": "Multilinguality", "label": 13}, {"loc": [2.1346378326416016, 4.778950214385986], "id": 2694, "title": "Mixed-modality Representation Learning and Pre-training for Joint Table-and-Text Retrieval in OpenQA", "authors": "Junjie Huang, Wanjun Zhong, Qian Liu, Ming Gong, Daxin Jiang and Nan Duan", "abstract": "Retrieving evidences from tabular and textual resources is essential for open-domain question answering (OpenQA), which provides more comprehensive information. However, training an effective dense table-text retriever is difficult due to the challenges of table-text discrepancy and data sparsity problem. To address the above challenges, we introduce an optimized OpenQA Table-Text Retriever (OTTeR) to jointly retrieve tabular and textual evidences. Firstly, we propose to enhance mixed-modality representation learning via two mechanisms: modality-enhanced representation and mixed-modality negative sampling strategy. Secondly, to alleviate data sparsity problem and enhance the general retrieval ability, we conduct retrieval-centric mixed-modality synthetic pre-training. Experimental results demonstrate that OTTeR substantially improves the performance of table-and-text retrieval on the OTT-QA dataset. Comprehensive analyses examine the effectiveness of all the proposed mechanisms. Besides, equipped with OTTeR, our OpenQA system achieves the state-of-the-art result on the downstream QA task, with 10.1\\% absolute improvement in terms of the exact match over the previous best system. \\footnote{All the code and data are available at \\url{https://github.com/Jun-jie-Huang/OTTeR}.}", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [8.984067916870117, 6.252177715301514], "id": 2705, "title": "The Effects of Corpus Choice and Morphosyntax on Multilingual Space Induction", "authors": "Vinit Ravishankar and Joakim Nivre", "abstract": "In an effort to study the inductive biases of language models, numerous studies have attempted to use linguistically motivated tasks as a proxy of sorts, wherein performance on these tasks would imply an inductive bias towards a specific linguistic phenomenon. In this study, we attempt to analyse the inductive biases of language models with respect to natural language phenomena, in the context of building multilingual embedding spaces.\n\nWe sample corpora from 2 sources in 15 languages and train language models on pseudo-bilingual variants of each corpus, created by duplicating each corpus and shifting token indices for half the resulting corpus. We evaluate the cross-lingual capabilities of these LMs, and show that while correlations with language families tend to be weak, other corpus-level characteristics, such as type-token ratio, tend to be more strongly correlated. Finally, we show that multilingual spaces can be built, albeit less effectively, even when additional destructive perturbations are applied to the training corpora, implying that (effectively) bag-of-words models also have an inductive bias that is sufficient for inducing multilingual spaces.", "track": "Multilinguality", "label": 13}, {"loc": [4.057755947113037, 7.518723487854004], "id": 2712, "title": "Modeling Complex Dialogue Mappings via Sentence Semantic Segmentation Guided Conditional Variational Auto-Encoder", "authors": "Bin Sun, Shaoxiong Feng, Yiwei Li, Weichao Wang, Fei Mi, Yitong Li and Kan Li", "abstract": "Complex dialogue mappings (CDM), including one-to-many and many-to-one mappings, tend to make dialogue models generate incoherent or dull responses, and modeling these mappings remains a huge challenge for neural dialogue systems. To alleviate these problems, methods like introducing external information, reconstructing the optimization function, and manipulating data samples are proposed, while they primarily focus on avoiding training with CDM, inevitably weakening the model's ability of understanding CDM in human conversations and limiting further improvements in model performance. This paper proposes a Sentence Semantic Segmentation guided Conditional Variational Auto-Encoder (SegCVAE) method which can model and take advantages of the CDM data. Specifically, to tackle the incoherent problem caused by one-to-many, SegCVAE uses response-related prominent semantics to constrained the latent variable. To mitigate the non-diverse problem brought by many-to-one, SegCVAE segments multiple prominent semantics to enrich the latent variables. Three novel components, Internal Separation, External Guidance, and Semantic Norms, are proposed to achieve SegCVAE. On dialogue generation tasks, both the automatic and human evaluation results show that SegCVAE achieves new state-of-the-art performance.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [2.9457786083221436, 7.1463518142700195], "id": 2715, "title": "Graph Embeddings for Argumentation Quality Assessment", "authors": "Santiago Marro, Elena Cabrio and Serena Villata", "abstract": "Argumentation is used by people both internally, by evaluating arguments and counterarguments to make sense of a situation and take a decision, and externally, e.g., in a debate, by exchanging arguments to reach an agreement or to promote an individual position. In this context, the assessment of the quality of the arguments is of extreme importance, as it strongly influences the evaluation of the overall argumentation, impacting on the decision making process. The automatic assessment of the quality of natural language arguments is recently attracting interest in the Argument Mining field. However, the issue of automatically assessing the quality of an argumentation largely remains a challenging unsolved task. Our contribution is twofold: first, we present a novel resource of 402 student persuasive essays, where three main quality dimensions (i.e., cogency, rhetoric, and reasonableness) have been annotated, leading to 1908 arguments tagged with quality facets; second, we address this novel task of argumentation quality assessment proposing a novel neural architecture based on graph embeddings, that combines both the textual features of the natural language arguments and the overall argument graph, i.e., considering also the support and attack relations holding among the arguments. Results on the persuasive essays dataset outperform state-of-the-art and standard baselines' performance.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [0.5103975534439087, 7.130259990692139], "id": 2737, "title": "SMiLE: Schema-augmented Multi-level Contrastive Learning for Knowledge Graph Link Prediction", "authors": "Miao Peng, Ben Liu, Qianqian Xie, Wenjie Xu, Hua Wang and Min Peng", "abstract": "Link prediction is the task of inferring missing links between entities in knowledge graphs. Embedding-based methods have shown effectiveness in addressing this problem by modeling relational patterns in triples. However, the link prediction task often requires contextual information in entity neighborhoods, while most existing embedding-based methods fail to capture it. Additionally, little attention is paid to the diversity of entity representations in different contexts, which often leads to false prediction results. In this situation, we consider that the schema of knowledge graph contains the specific contextual information, and it is beneficial for preserving the consistency of entities across contexts. In this paper, we propose a novel Schema-augmented Multi-level contrastive LEarning framework (SMiLE) to conduct knowledge graph link prediction. Specifically, we first exploit network schema as the prior constraint to sample negatives and pre-train our model by employing a multi-level contrastive learning method to yield both prior schema and contextual information. Then we fine-tune our model under the supervision of individual triples to learn subtler representations for link prediction. Extensive experimental results on four knowledge graph datasets with thorough analysis of each component demonstrate the effectiveness of our proposed framework against state-of-the-art baselines. The implementation of SMiLE is available at https://github.com/GKNL/SMiLE.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [6.161717414855957, 12.386127471923828], "id": 2744, "title": "Multilingual Multimodal Learning with Machine Translated Text", "authors": "Chen Qiu, Dan Onea\u021b\u0103, Emanuele Bugliarello, Stella Frank and Desmond Elliott", "abstract": "Most vision-and-language pretraining research focuses on English tasks. However, the creation of multilingual multimodal evaluation datasets (e.g. Multi30K, xGQA, XVNLI, and MaRVL) poses a new challenge in finding high-quality training data that is both multilingual and multimodal. In this paper, we investigate whether machine translating English multimodal data can be an effective proxy for the lack of readily available multilingual data. We call this framework TD-MML: Translated Data for Multilingual Multimodal Learning, and it can be applied to any multimodal dataset and model. We apply it to both pretraining and fine-tuning data with a state-of-the-art model. In order to prevent models from learning from low-quality translated text, we propose two metrics for automatically removing such translations from the resulting datasets. In experiments on five tasks across 20 languages in the IGLUE benchmark, we show that translated data can provide a useful signal for multilingual multimodal learning, both at pretraining and fine-tuning.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [3.783978223800659, 9.837563514709473], "id": 2760, "title": "Learning From the Source Document: Unsupervised Abstractive Summarization", "authors": "Haojie Zhuang, Wei Emma Zhang, Jian Yang, Congbo Ma, Yutong Qu and Quan Z. Sheng", "abstract": "Most of the state-of-the-art methods for abstractive text summarization are under supervised learning settings, while heavily relying on high-quality and large-scale parallel corpora. In this paper, we remove the need for reference summaries and present an unsupervised learning method SCR (Summarize, Contrast and Review) for abstractive summarization, which leverages contrastive learning and is the first work to apply contrastive learning for unsupervised abstractive summarization. Particularly, we use the true source documents as positive source document examples, and strategically generated fake source documents as negative source document examples to train the model to generate good summaries. Furthermore, we consider and improve the writing quality of the generated summaries by guiding them to be similar to human-written texts. The promising results on extensive experiments show that SCR outperforms other unsupervised abstractive summarization baselines, which demonstrates its effectiveness.", "track": "Summarization", "label": 14}, {"loc": [7.4969611167907715, 12.305801391601562], "id": 2763, "title": "How to Do Things without Words: Modeling Semantic Drift of Emoji", "authors": "Eyal Arviv and Oren Tsur", "abstract": "Emoji have become a significant part of our informal textual communication. Previous work, addressing the societal and linguistic functions of emoji, overlooked the relation between the semantics and the visual variations of the symbols. In this paper we model and analyze the semantic drift of emoji and discuss the features that may be contributing to the drift, some are unique to emoji and some are more general. Specifically, we explore the relations between graphical changes and semantic changes.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [6.287376403808594, 5.739569187164307], "id": 2769, "title": "Mind Your Bias: A Critical Review of Bias Detection Methods for Contextual Language Models", "authors": "Silke Husse and Andreas Spitz", "abstract": "The awareness and mitigation of biases are of fundamental importance for the fair and transparent use of contextual language models, yet they crucially depend on the accurate detection of biases as a precursor. Consequently, numerous bias detection methods have been proposed, which vary in their approach, the considered type of bias, and the data used for evaluation. However, while most detection methods are derived from the word embedding association test for static word embeddings, the reported results are heterogeneous, inconsistent, and ultimately inconclusive. To address this issue, we conduct a rigorous analysis and comparison of bias detection methods for contextual language models. Our results show that minor design and implementation decisions (or errors) have a substantial and often significant impact on the derived bias scores. Overall, we find the state of the field to be both worse than previously acknowledged due to systematic and propagated errors in implementations, yet better than anticipated since divergent results in the literature homogenize after accounting for implementation errors. Based on our findings, we conclude with a discussion of paths towards more robust and consistent bias detection methods.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [8.085168838500977, 9.3946533203125], "id": 2782, "title": "ZeroPrompt: Scaling Prompt-Based Pretraining to 1,000 Tasks Improves Zero-Shot Generalization", "authors": "Hanwei Xu, Yujun Chen, Yulun Du, Nan Shao, wang yanggang, Haiyu Li and Zhilin Yang", "abstract": "We propose a multitask pretraining approach ZeroPrompt for zero-shot generalization, focusing on task scaling and zero-shot prompting.\nWhile previous models are trained on only a few dozen tasks, we scale to 1,000 tasks for the first time using real-world data. This leads to a crucial discovery that task scaling can be an efficient alternative to model scaling; i.e., the model size has less impact on performance with an extremely large number of tasks. Our results show that task scaling can improve training efficiency by 30 times in FLOPs.\nEmpirically, ZeroPrompt substantially improves both the efficiency and the performance of zero-shot learning across a variety of academic and production datasets.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [2.3767037391662598, 7.324089050292969], "id": 2784, "title": "Semantic Role Labeling Meets Definition Modeling: Using Natural Language to Describe Predicate-Argument Structures", "authors": "Simone Conia, Edoardo Barba, Alessandro Scir\u00e8 and Roberto Navigli", "abstract": "One of the common traits of past and present approaches for Semantic Role Labeling (SRL) is that they rely upon discrete labels drawn from a predefined linguistic inventory to classify predicate senses and their arguments.\nHowever, we argue this need not be the case. In this paper, we present an approach that leverages Definition Modeling to introduce a generalized formulation of SRL as the task of describing predicate-argument structures using natural language definitions instead of discrete labels. Our novel formulation takes a first step towards placing interpretability and flexibility foremost, and yet our experiments and analyses on PropBank-style and FrameNet-style, dependency-based and span-based SRL also demonstrate that a flexible model with an interpretable output does not necessarily come at the expense of performance. We release our software for research purposes at https://github.com/SapienzaNLP/dsrl.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [6.932897567749023, 5.943874835968018], "id": 2787, "title": "Is anisotropy really the cause of BERT embeddings not being semantic?", "authors": "Alejandro Fuster Baggetto and Victor Fresno", "abstract": "In this paper we conduct a set of experiments aimed to improve our understanding of the lack of semantic isometry in BERT, i.e. the lack of correspondence between the embedding and meaning spaces of its contextualized word representations. Our empirical results show that, contrary to popular belief, the anisotropy is not the root cause of the poor performance of these contextual models' embeddings in semantic tasks. What does affect both the anisotropy and semantic isometry is a set of known biases: frequency, subword, punctuation, and case. For each one of them, we measure its magnitude and the effect of its removal, showing that these biases contribute but do not completely explain the phenomenon of anisotropy and lack of semantic isometry of these contextual language models.", "track": "Theme Track", "label": 18}, {"loc": [10.240006446838379, 6.948047161102295], "id": 2800, "title": "m^4 Adapter: Multilingual Multi-Domain Adaptation for Machine Translation with a Meta-Adapter", "authors": "Wen Lai, Alexandra Chronopoulou and Alexander Fraser", "abstract": "Multilingual neural machine translation models (MNMT) yield state-of-the-art performance when evaluated on data from a domain and language pair seen at training time. However, when a MNMT model is used to translate under domain shift or to a new language pair, performance drops dramatically. We consider a very challenging scenario: adapting the MNMT model both to a new domain and to a new language pair at the same time. In this paper, we propose m^4Adapter (Multilingual Multi-Domain Adaptation for Machine Translation with a Meta-Adapter), which combines domain and language knowledge using meta-learning with adapters. We present results showing that our approach is a parameter-efficient solution which effectively adapts a model to both a new language pair and a new domain, while outperforming other adapter methods. An ablation study also shows that our approach more effectively transfers domain knowledge across different languages and language information across different domains.", "track": "Machine Translation", "label": 10}, {"loc": [4.06857967376709, 3.9447531700134277], "id": 2806, "title": "Textual Enhanced Contrastive Learning for Solving Math Word Problems", "authors": "Yibin Shen, Qianying Liu, Zhuoyuan Mao, Fei Cheng and Sadao Kurohashi", "abstract": "Solving math word problems is the task that analyses the relation of quantities and requires an accurate understanding of contextual natural language information. Recent studies show that current models rely on shallow heuristics to predict solutions and could be easily misled by small textual perturbations. To address this problem, we propose a Textual Enhanced Contrastive Learning framework, which enforces the models to distinguish semantically similar examples while holding different mathematical logic. We adopt a self-supervised manner strategy to enrich examples with subtle textual variance by textual reordering or problem re-construction. We then retrieve the hardest to differentiate samples from both equation and textual perspectives and guide the model to learn their representations. Experimental results show that our method achieves state-of-the-art on both widely used benchmark datasets and also exquisitely designed challenge datasets in English and Chinese.", "track": "Question Answering", "label": 11}, {"loc": [10.606300354003906, 6.930465221405029], "id": 2846, "title": "What Do Compressed Multilingual Machine Translation Models Forget?", "authors": "Alireza Mohammadshahi, Vassilina Nikoulina, Alexandre Berard, Caroline Brun, James Henderson and Laurent Besacier", "abstract": "Recently, very large pre-trained models achieve state-of-the-art results in various natural language processing (NLP) tasks, but their size makes it more challenging to apply them in resource-constrained environments. Compression techniques allow to drastically reduce the size of the models and therefore their inference time with negligible impact on top-tier metrics. However, the general performance averaged across multiple tasks and/or languages may hide a drastic performance drop on under-represented features, which could result in the amplification of biases encoded by the models. \nIn this work, we assess the impact of compression methods on Multilingual Neural Machine Translation models (MNMT) for various language groups, gender, and semantic biases by extensive analysis of compressed models on different machine translation benchmarks, i.e. FLORES-101, MT-Gender, and DiBiMT. We show that the performance of under-represented languages drops significantly, while the average BLEU metric only slightly decreases. Interestingly, the removal of noisy memorization with compression leads to a significant improvement for some medium-resource languages. Finally, we demonstrate that compression amplifies intrinsic gender and semantic biases, even in high-resource languages.", "track": "Machine Translation", "label": 10}, {"loc": [4.281546592712402, 7.545773506164551], "id": 2849, "title": "Controllable Dialogue Simulation with In-context Learning", "authors": "Zekun Li, Wenhu Chen, Shiyang Li, Hong Wang, Jing Qian and Xifeng Yan", "abstract": "Building dialogue systems requires a large corpus of annotated dialogues. Such datasets are usually created via crowdsourcing, which is expensive and time-consuming. In this paper, we propose \\textsc{Dialogic}, a novel dialogue simulation method based on large language model in-context learning to automate dataset creation. Seeded with a few annotated dialogues, \\textsc{Dialogic} automatically selects in-context examples for demonstration and prompts GPT-3 to generate new dialogues and annotations in a controllable way. Our method can rapidly expand a small set of dialogue data with minimum or zero \\textit{human involvement} and \\textit{parameter update} and is thus much more cost-efficient and time-saving than crowdsourcing. Experimental results on the MultiWOZ dataset demonstrate that training a model on the simulated dialogues leads to even better performance than using the same amount of human-generated dialogues under the challenging low-resource settings, with as few as 85 dialogues as a seed. When the full training set is given, our method can still serve as an effective data augmentation method to further improve performance. Human evaluation results show that our simulated dialogues have near-human fluency and annotation accuracy. The code and data are available at \\textbf{\\url{https://github.com/Leezekun/dialogic}}.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [2.7393853664398193, 8.892248153686523], "id": 2856, "title": "Improving the Factual Correctness of Radiology Report Generation with Semantic Rewards", "authors": "Jean-Benoit Delbrouck, Pierre Chambon, Christian Bluethgen, Emily Tsai, Omar Almusa and Curtis Langlotz", "abstract": "Neural image-to-text radiology report generation systems offer the potential to improve radiology reporting by reducing the repetitive process of report drafting and identifying possible medical errors. These systems have achieved promising performance as measured by widely used NLG metrics such as BLEU and CIDEr. However, the current systems face important limitations. First, they present an increased complexity in architecture that offers only marginal improvements on NLG metrics. Secondly, these systems that achieve high performance on these metrics are not always factually complete or consistent due to both inadequate training and evaluation. Recent studies have shown the systems can be substantially improved by using new methods encouraging 1) the generation of domain entities consistent with the reference and 2) describing these entities in inferentially consistent ways. So far, these methods rely on weakly-supervised approaches (rule-based) and named entity recognition systems that are not specific to the chest X-ray domain. To overcome this limitation, we propose a new method, the RadGraph reward, to further improve the factual completeness and correctness of generated radiology reports. More precisely, we leverage the RadGraph dataset containing annotated chest X-ray reports with entities and relations between entities. On two open radiology report datasets, our system substantially improves the scores up to 14.2% and 25.3% on metrics evaluating the factual correctness and completeness of reports.", "track": "Natural Language Generation", "label": 6}, {"loc": [7.384744167327881, 7.112301349639893], "id": 2858, "title": "Recursive Neural Networks with Bottlenecks Diagnose (Non-)Compositionality", "authors": "Verna Dankers and Ivan Titov", "abstract": "A recent line of work in NLP focuses on the (dis)ability of models to generalise compositionally for artificial languages.\nHowever, when considering natural language tasks, the data involved is not strictly, or locally, compositional.\nQuantifying the compositionality of data is a challenging task, which has been investigated primarily for short utterances.\nWe use recursive neural models (Tree-LSTMs) with bottlenecks that limit the transfer of information between nodes.\nWe illustrate that comparing data's representations in models with and without the bottleneck can be used to produce a compositionality metric.\nThe procedure is applied to the evaluation of arithmetic expressions using synthetic data, and sentiment classification using natural language data.\nWe demonstrate that compression through a bottleneck impacts non-compositional examples disproportionately\nand then use the bottleneck compositionality metric (BCM) to distinguish compositional from non-compositional samples, yielding a compositionality ranking over a dataset.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [8.762787818908691, 6.923155784606934], "id": 2897, "title": "HumSet: Dataset of Multilingual Information Extraction and Classification for Humanitarian Crises Response", "authors": "Selim Fekih, Nicolo' Tamagnone, Benjamin Minixhofer, Ranjan Shrestha, Ximena Contla, Ewan Oglethorpe and Navid Rekabsaz", "abstract": "Timely and effective response to humanitarian crises requires quick and accurate analysis of large amounts of text data \u2013 a process that can highly benefit from expert-assisted NLP systems trained on validated and annotated data in the humanitarian response domain. To enable creation of such NLP systems, we introduce and release HumSet, a novel and rich multilingual dataset of humanitarian response documents annotated by experts in the humanitarian response community. The dataset provides documents in three languages (English, French, Spanish) and covers a variety of humanitarian crises from 2018 to 2021 across the globe. For each document, HUMSET provides selected snippets (entries) as well as assigned classes to each entry annotated using common humanitarian information analysis frameworks. HUMSET also provides novel and challenging entry extraction and multi-label entry classification tasks. In this paper, we take a first step towards approaching these tasks and conduct a set of experiments on Pre-trained Language Models (PLM) to establish strong baselines for future research in this domain. The dataset is available at https://blog.thedeep.io/humset/.", "track": "Resources and Evaluation", "label": 1}, {"loc": [10.595967292785645, 7.622678756713867], "id": 2901, "title": "Viterbi Decoding of Directed Acyclic Transformer for Non-Autoregressive Machine Translation", "authors": "Chenze Shao, Zhengrui Ma and Yang Feng", "abstract": "Non-autoregressive models achieve significant decoding speedup in neural machine translation but lack the ability to capture sequential dependency. Directed Acyclic Transformer (DA-Transformer) was recently proposed to model sequential dependency with a directed acyclic graph. Consequently, it has to apply a sequential decision process at inference time, which harms the global translation accuracy. In this paper, we present a Viterbi decoding framework for DA-Transformer, which guarantees to find the joint optimal solution for the translation and decoding path under any length constraint. Experimental results demonstrate that our approach consistently improves the performance of DA-Transformer while maintaining a similar decoding speedup.", "track": "Machine Translation", "label": 10}, {"loc": [6.90501070022583, 6.350910663604736], "id": 2907, "title": "Lexical Generalization Improves with Larger Models and Longer Training", "authors": "Elron Bandel, Yoav Goldberg and Yanai Elazar", "abstract": "While fine-tuned language models perform well on many language tasks, they were also shown to rely on superficial surface features such as lexical overlap. Excessive utilization of such heuristics can lead to failure on challenging inputs. We analyze the use of lexical overlap heuristics in natural language inference, paraphrase detection, and reading comprehension (using a novel contrastive dataset),\nand find that larger models are much less susceptible to adopting lexical overlap heuristics. We also find that longer training leads models to abandon lexical overlap heuristics. Finally, We provide evidence that the disparity between models size has its source in the pre-trained model.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [4.461104393005371, 4.851218223571777], "id": 2920, "title": "Realistic Data Augmentation Framework for Enhancing Tabular Reasoning", "authors": "Dibyakanti Kumar, Vivek Gupta, Soumya Sharma and Shuo Zhang", "abstract": "Existing approaches to constructing training data for Natural Language Inference (NLI) tasks, such as for semi-structured table reasoning, are either via crowdsourcing or fully automatic methods. However, the former is expensive and time consuming and thus limits scale, and the latter often produces naive examples that may lack complex reasoning. This paper develops a realistic semi-automated framework for data augmentation for tabular inference. Instead of manually generating a hypothesis for each table, our methodology generates hypothesis templates transferable to similar tables. In addition, our framework entails the creation of rational counterfactual tables based on human written logical constraints and premise paraphrasing. For our case study, we use the INFOTABS (Gupta et al., 2020), which is an entity centric tabular inference dataset. We observed that our framework could generate human-like tabular inference examples, which could benefit training data augmentation, especially in the scenario with limited supervision.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [5.807241439819336, 6.0446038246154785], "id": 2930, "title": "Inducing Generalizable and Interpretable Lexica", "authors": "Yilin Geng, Zetian Wu, Roshan Santhosh, Tejas Srivastava, Lyle Ungar and Jo\u00e3o Sedoc", "abstract": "Lexica \u2013 words and associated scores \u2013 are widely used as simple, interpretable, generalizable language features to predict sentiment, emotions, mental health, and personality. They also provide insight into the psychological features behind those moods and traits. Such lexica, historically created by human experts, are valuable to linguists, psychologists, and social scientists, but they take years of refinement and have limited coverage. In this paper, we investigate how the lexica that provide psycholinguistic insights could be computationally induced and how they should be assessed. We identify generalizability and interpretability as two essential properties of such lexica. We induce lexica using both context-oblivious and context-aware approaches, compare their predictive performance both within the training corpus and across various corpora, and evaluate their quality using crowd-worker assessment. We find that lexica induced from context-oblivious models are more generalizable and interpretable than those from more accurate context-aware transformer models. In addition, lexicon scores can identify explanatory words more reliably than a high performing transformer with feature-importance measures like SHAP.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [7.066840648651123, 6.204687118530273], "id": 2931, "title": "The Curious Case of Absolute Position Embeddings", "authors": "Koustuv Sinha, Amirhossein Kazemnejad, Siva Reddy, Joelle Pineau, Dieuwke Hupkes and Adina Williams", "abstract": "Transformer language models encode the notion of word order using positional information. Most commonly, this positional information is represented by absolute position embeddings (APEs), that are learned from the pretraining data. However, in natural language, it is not absolute position that matters, but relative position, and the extent to which APEs can capture this type of information has not been studied. In this work, we observe that models trained with APE over-rely on positional information to the point that they break-down when subjected to sentences with shifted position information. Specifically, when models are subjected to sentences starting from a non-zero position (excluding the effect of priming), they exhibit noticeably degraded performance on zero- to full-shot tasks, across a range of model families and model sizes. Our findings raise questions about the efficacy of APEs to model the relativity of position information, and invite further introspection on the sentence and word order processing strategies employed by these models.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [5.683578968048096, 10.633499145507812], "id": 2943, "title": "Goal-oriented Vision-and-Dialog Navigation via Reinforcement Learning", "authors": "Yan Cao, Keting Lu, David DeFazio and Shiqi Zhang", "abstract": "Vision-and-dialog navigation is a recent benchmark for evaluating the AI capabilities of perception, interaction, and decision making. While existing methods developed for this benchmark have demonstrated great successes, they mostly rely on large datasets, where data collection can be a challenge, and the learned policies are not adaptive to domain changes. In this paper, we focus on a new problem, referred to as goal-oriented vision-and-dialog navigation (GVDN), where an agent uses reinforcement learning techniques to compute dialog-navigation policies from trial and error. A robot conducts visual navigation to locate target objects, and can talk to a remote human operator as needed. Our remote human is able to provide guidance on navigation only if the robot correctly conveys its location through dialog. Experiments have been conducted using photo-realistic simulation environments. Results suggest that, our agent outperforms competitive baselines in success rate.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [4.398736953735352, 4.777635097503662], "id": 2947, "title": "Leveraging Data Recasting to Enhance Tabular Reasoning", "authors": "Aashna Jena, Vivek Gupta, Manish Shrivastava and Julian Martin Eisenschlos", "abstract": "Creating challenging tabular inference data is essential for learning complex reasoning. Prior work has mostly relied on two data generation strategies. The first is human annotation, which yields linguistically diverse data but is difficult to scale. The second category for creation is synthetic generation, which is scalable and cost effective but lacks inventiveness. In this research, we present a framework for semi-automatically recasting existing tabular data to make use of the benefits of both approaches. We utilize our framework to build tabular NLI instances from five datasets that were initially intended for tasks like table2text creation, tabular Q/A, and semantic parsing. We demonstrate that recasted data could be used as evaluation benchmarks as well as augmentation data to enhance performance on tabular NLI tasks. Furthermore, we investigate the effectiveness of models trained on recasted data in the zero-shot scenario, and analyse trends in performance across different recasted datasets types.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [2.424914836883545, 8.633315086364746], "id": 2951, "title": "Thinking about GPT-3 In-Context Learning for Biomedical IE? Think Again", "authors": "Bernal Jimenez Gutierrez, Nikolas McNeal, Clayton B. Washington, You Chen, Lang Li, Huan Sun and Yu Su", "abstract": "Large pre-trained language models (PLMs) such as GPT-3 have shown strong in-context learning capabilities, which are highly appealing for domains such as biomedicine that feature high and diverse demands of language technologies but also high data annotation costs. In this paper, we present the first systematic and comprehensive study to compare the few-shot performance of GPT-3 in-context learning with fine-tuning smaller (i.e., BERT-sized) PLMs on two representative biomedical information extraction (IE) tasks: named entity recognition and relation extraction. We follow the true few-shot setting to avoid overestimating models' few-shot performance by model selection over a large validation set. We also optimize GPT-3's performance with known techniques such as contextual calibration and dynamic in-context example retrieval. However, our results show that GPT-3 still significantly underperforms compared to simply fine-tuning a smaller PLM. In addition, GPT-3 in-context learning also yields smaller gains in accuracy when more training data becomes available. More in-depth analyses further reveal issues of in-context learning that may be detrimental to IE tasks in general. Given the high cost of experimenting with GPT-3, we hope our study provides helpful guidance for biomedical researchers and practitioners towards more practical solutions such as fine-tuning small PLMs before better in-context learning is available for biomedical IE.", "track": "Information Extraction", "label": 5}, {"loc": [7.010500907897949, 6.301388263702393], "id": 2956, "title": "Attention weights accurately predict language representations in the brain", "authors": "Mathis Lamarre, Catherine Chen and Fatma Deniz", "abstract": "In Transformer-based language models (LMs) the attention mechanism converts token embeddings into contextual embeddings that incorporate information from neighboring words. The resulting contextual hidden state embeddings have enabled highly accurate models of brain responses, suggesting that the attention mechanism constructs contextual embeddings that carry information reflected in language-related brain representations. However, it is unclear whether the attention weights that are used to integrate information across words are themselves related to language representations in the brain. To address this question we analyzed functional magnetic resonance imaging (fMRI) recordings of participants reading English language narratives. We provided the narrative text as input to two LMs (BERT and GPT-2) and extracted their corresponding attention weights. We then used encoding models to determine how well attention weights can predict recorded brain responses. We find that attention weights accurately predict brain responses in much of the frontal and temporal cortices. Our results suggest that the attention mechanism itself carries information that is reflected in brain representations. Moreover, these results indicate cortical areas in which context integration may occur.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [7.439350605010986, 5.8168158531188965], "id": 2967, "title": "Improving HowNet-Based Chinese Word Sense Disambiguation with Translations", "authors": "xiang zhang, Bradley Hauer and Grzegorz Kondrak", "abstract": "Word sense disambiguation (WSD) is the task of identifying the intended sense of a word in context. While prior work on unsupervised WSD has leveraged lexical knowledge bases, such as WordNet and BabelNet, these resources have proven to be less effective for Chinese. Instead, the most widely used lexical knowledge base for Chinese is HowNet. Previous HowNet-based WSD methods have not exploited contextual translation information. In this paper, we present the first HowNet-based WSD system which combines monolingual contextual information from a pretrained neural language model with bilingual information obtained via machine translation and sense translation information from HowNet. The results of our evaluation experiment on a test set from prior work demonstrate that our new method achieves a new state of the art for unsupervised Chinese WSD.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [2.145115852355957, 7.550420761108398], "id": 2970, "title": "Mask-then-Fill: A Flexible and Effective Data Augmentation Framework for Event Extraction", "authors": "Jun Gao, Changlong Yu, Wei Wang, Huan Zhao and Ruifeng Xu", "abstract": "We present Mask-then-Fill, a flexible and effective data augmentation framework for event extraction. Our approach allows for more flexible manipulation of text and thus can generate more diverse data while keeping the original event structure unchanged as much as possible. Specifically, it first randomly masks out an adjunct sentence fragment and then infills a variable-length text span with a fine-tuned infilling model. The main advantage lies in that it can replace a fragment of arbitrary length in the text with another fragment of variable length, compared to the existing methods which can only replace a single word or a fixed-length fragment. On trigger and argument extraction tasks, the proposed framework is more effective than baseline methods and it demonstrates particularly strong results in the low-resource setting. Our further analysis shows that it achieves a good balance between diversity and distributional similarity.", "track": "Information Extraction", "label": 5}, {"loc": [4.655350208282471, 9.159080505371094], "id": 2972, "title": "MOBA-E2C: Generating MOBA Game Commentaries via Capturing Highlight Events from the Meta-Data", "authors": "Dawei Zhang, Sixing Wu, Yao Guo and Xiangqun Chen", "abstract": "MOBA (Multiplayer Online Battle Arena) games such as Dota2 are currently one of the most popular e-sports gaming genres. Following professional commentaries is a great way to understand and enjoy a MOBA game. However, massive game competitions lack commentaries because of the shortage of professional human commentators. As an alternative, employing machine commentators that can work at any time and place is a feasible solution. \nConsidering the challenges in modeling MOBA games, we propose a data-driven MOBA commentary generation framework, MOBA-E2C, allowing a model to generate commentaries based on the game meta-data. Subsequently, to alleviate the burden of collecting supervised data, we propose a MOBA-FuseGPT generator to generate MOBA game commentaries by fusing the power of a rule-based generator and a generative GPT generator. Finally, in the experiments, we take a popular MOBA game Dota2 as our case and construct a Chinese Dota2 commentary generation dataset Dota2-Commentary. Experimental results demonstrate the superior performance of our approach. To the best of our knowledge, this work is the first Dota2 machine commentator and Dota2-Commentary is the first dataset.", "track": "NLP Applications", "label": 0}, {"loc": [6.021492958068848, 8.31048583984375], "id": 2977, "title": "Enhancing Automatic Readability Assessment with Pre-training and Soft Labels for Ordinal Regression", "authors": "Jinshan Zeng, Yudong Xie, Xianglong Yu, John Lee and Ding-Xuan Zhou", "abstract": "The readability assessment task aims to assign a difficulty grade to a text. While neural models have recently demonstrated impressive performance, most do not exploit the ordinal nature of the difficulty grades, and make little effort for model initialization to facilitate fine-tuning. We address these limitations with soft labels for ordinal regression, and with model pre-training through prediction of pairwise relative text difficulty. We incorporate these two components into a model based on hierarchical attention networks, and evaluate its performance on both English and Chinese datasets. Experimental results show that our proposed model outperforms competitive neural models and statistical classifiers on most datasets.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.632043361663818, 6.651770114898682], "id": 2985, "title": "Opening up Minds with Argumentative Dialogues", "authors": "Youmna Farag, Charlotte Brand, Jacopo Amidei, Paul Piwek, Tom Stafford, Svetlana Stoyanchev and Andreas Vlachos", "abstract": "Recent research on argumentative dialogues has focused on persuading people to take some action, changing their stance on the topic of discussion, or winning debates. In this work, we focus on argumentative dialogues that aim to open up (rather than change) people's minds to help them become more understanding to views that are unfamiliar or in opposition to their own convictions. To this end, we present a dataset of $183$ argumentative dialogues about $3$ controversial topics: veganism, Brexit and COVID-19 vaccination. The dialogues were collected using the Wizard of Oz approach, where wizards leverage a knowledge-base of arguments to converse with participants. Open-mindedness is measured before and after engaging in the dialogue using a questionnaire from the psychology literature, and success of the dialogue is measured as the change in the participant's stance towards those who hold opinions different to theirs. We evaluate two dialogue models: a Wikipedia-based and an argument-based model. We show that while both models perform closely in terms of opening up minds, the argument-based model is significantly better on other dialogue properties such as engagement and clarity.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [5.2706990242004395, 5.317431449890137], "id": 3018, "title": "You Are My Type! Type Embeddings for Pre-trained Language Models", "authors": "Mohammed Adel Saeed and Paolo Papotti", "abstract": "One reason for the positive impact of Pre-trained Language Models (PLMs) in NLP tasks is their ability to encode semantic types, such as `European City' or `Woman'. While previous work has analyzed such information in the context of interpretability, it is not clear how to use types to steer the PLM output. For example, in a cloze statement, it is desirable to steer the model to generate a token that satisfies a user-specified type, e.g., predict a date rather than a location. In this work, we introduce Type Embeddings (TEs), an input embedding that promotes desired types in a PLM. Our proposal is to define a type by a small set of word examples. We empirically study the ability of TEs both in representing types and in steering masking predictions without changes to the prompt text in BERT. Finally, using the LAMA datasets, we show how TEs highly improve the precision in extracting facts from PLMs.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [7.635558128356934, 3.6308112144470215], "id": 3037, "title": "Generating Textual Adversaries with Minimal Perturbation", "authors": "Xingyi Zhao, Lu Zhang, Depeng Xu and Shuhan Yuan", "abstract": "Many word-level adversarial attack approaches for textual data have been proposed in recent studies. However, due to the massive search space consisting of combinations of candidate words, the existing approaches face the problem of preserving the semantics of texts when crafting adversarial counterparts. In this paper, we develop a novel attack strategy to find adversarial texts with high similarity to the original texts while introducing minimal perturbation. The rationale is that we expect the adversarial texts with small perturbation can better preserve the semantic meaning of original texts. Experiments show that, compared with state-of-the-art attack approaches, our approach achieves higher success rates and lower perturbation rates in four benchmark datasets.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [7.068790435791016, 5.832129001617432], "id": 3038, "title": "SensePOLAR: Word sense aware interpretability for pre-trained contextual word embeddings", "authors": "Jan Engler, Sandipan Sikdar, Marlene Lutz and Markus Strohmaier", "abstract": "Adding interpretability to word embeddings represents an area of active research in text\nrepresentation. Recent work has explored the potential of embedding words via so-called polar\ndimensions (e.g. good vs. bad, correct vs. wrong). Examples of such recent approaches\ninclude SemAxis, POLAR, FrameAxis, and BiImp. Although these approaches provide interpretable\ndimensions for words, they have not been designed to deal with polysemy, i.e. they can not easily distinguish between different senses of words. To address this limitation, we present SensePOLAR, an extension of the original POLAR framework that enables wordsense aware interpretability for pre-trained contextual word embeddings. The resulting interpretable word embeddings achieve a level of\nperformance that is comparable to original contextual word embeddings across a variety of\nnatural language processing tasks including the GLUE and SQuAD benchmarks. Our work\nremoves a fundamental limitation of existing approaches by offering users sense aware interpretations\nfor contextual word embeddings.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.813555717468262, 5.833791732788086], "id": 3043, "title": "Contextualizing Language Models for Norms Diverging from Social Majority", "authors": "Niklas Kiehne, Hermann Kroll and Wolf-Tilo Balke", "abstract": "To comprehensibly contextualize decisions, artificial systems in social situations need a high degree of awareness of the rules of conduct of human behavior. Especially transformer-based language models have recently been shown to exhibit some such awareness. But what if norms in some social setting do not adhere to or even blatantly deviate from the mainstream? \n In this paper, we introduce a novel mechanism based on deontic logic to allow for a flexible adaptation of individual norms by de-biasing training data sets and a task-reduction to textual entailment. Building on the popular 'Moral Stories' dataset we on the one hand highlight the intrinsic bias of current language models, on the other hand characterize the adaptability of pre-trained models to deviating norms in fine-tuning settings.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [4.767204761505127, 6.926342487335205], "id": 3046, "title": "Empathetic Dialogue Generation via Sensitive Emotion Recognition and Sensible Knowledge Selection", "authors": "Lanrui Wang, Jiangnan Li, Zheng Lin, Fandong Meng, Chenxu Yang, Weiping Wang and Jie Zhou", "abstract": "Empathy, which is widely used in psychological counseling, is a key trait of everyday human conversations. Equipped with commonsense knowledge, current approaches to empathetic response generation focus on capturing implicit emotion within dialogue context, where the emotions are treated as a static variable throughout the conversations. However, emotions change dynamically between utterances, which makes previous works difficult to perceive the emotion flow and predict the correct emotion of the target response, leading to inappropriate response. Furthermore, simply importing commonsense knowledge without harmonization may trigger the conflicts between knowledge and emotion, which confuse the model to choose the correct information to guide the generation process. To address the above problems, we propose a Serial Encoding and Emotion-Knowledge interaction (SEEK) method for empathetic dialogue generation. We use a fine-grained encoding strategy which is more sensitive to the emotion dynamics (emotion flow) in the conversations to predict the emotion-intent characteristic of response. Besides, we design a novel framework to model the interaction between knowledge and emotion to solve the conflicts generate more sensible response. Extensive experiments on the utterance-level annotated EMPATHETICDIALOGUES demonstrate that SEEK outperforms the strong baseline in both automatic and manual evaluations.", "track": "Natural Language Generation", "label": 6}, {"loc": [0.5323992967605591, 6.972627639770508], "id": 3077, "title": "Joint Multilingual Knowledge Graph Completion and Alignment", "authors": "Vinh Tong, Dat Quoc Nguyen, Trung Thanh Huynh, Tam Thanh Nguyen, Quoc Viet Hung Nguyen and Mathias Niepert", "abstract": "Knowledge graph (KG) alignment and completion are usually treated as two independent tasks. While recent work has leveraged entity and relation alignments from multiple KGs, such as alignments between multilingual KGs with common entities and relations, a deeper understanding of the ways in which multilingual KG completion (MKGC) can aid the creation of multilingual KG alignments (MKGA) is still limited. Motivated by the observation that structural inconsistencies -- the main challenge for MKGA models -- can be mitigated through KG completion methods, we propose a novel model for jointly completing and aligning knowledge graphs. The proposed model combines two components that jointly accomplish KG completion and alignment. These two components employ relation-aware graph neural networks that we propose to encode multi-hop neighborhood structures into entity and relation representations. Moreover, we also propose (i) a structural inconsistency reduction mechanism to incorporate information from the completion into the alignment component, and (ii) an alignment seed enlargement and triple transferring mechanism to enlarge alignment seeds and transfer triples during KGs alignment. Extensive experiments on a public multilingual benchmark show that our proposed model outperforms existing competitive baselines, obtaining new state-of-the-art results on both MKGC and MKGA tasks.", "track": "Information Extraction", "label": 5}, {"loc": [2.885392189025879, 4.6865739822387695], "id": 3086, "title": "A Framework for Automatic Generation of Spoken Question-Answering Data", "authors": "Merve \u00dcnl\u00fc Menev\u015fe, Yusufcan Manav, Ebru Arisoy and Arzucan \u00d6zg\u00fcr", "abstract": "This paper describes a framework to automatically generate a spoken question answering (QA) dataset. The framework consists of a question generation (QG) module to generate questions automatically from given text documents, a text-to-speech (TTS) module to convert the text documents into spoken form and an automatic speech recognition (ASR) module to transcribe the spoken content. The final dataset contains question-answer pairs for both the reference text and ASR transcriptions as well as the audio files corresponding to each reference text. For QG and ASR systems we used pre-trained multilingual encoder-decoder transformer models and fine-tuned these models using a limited amount of manually generated QA data and TTS-based speech data, respectively. As a proof of concept, we investigated the proposed framework for Turkish and generated the Turkish Question Answering (TurQuAse) dataset using Wikipedia articles. Manual evaluation of the automatically generated question- answer pairs and QA performance evaluation with state of-the-art models on TurQuAse show that the proposed framework is efficient for automatically generating spoken QA datasets. To the best of our knowledge, TurQuAse is the first publicly available spoken question answering dataset for Turkish. The proposed framework can be easily extended to other languages where a limited amount of QA data is available.", "track": "Question Answering", "label": 11}, {"loc": [3.261929750442505, 9.424206733703613], "id": 3092, "title": "Readability Controllable Biomedical Document Summarization", "authors": "Zheheng Luo, Qianqian Xie and Sophia Ananiadou", "abstract": "Different from general documents, it is recognised that \nthe ease with which people can understand a biomedical text is eminently varied, owing to the highly technical nature of biomedical documents and the variance of readers' domain knowledge. \nHowever, existing biomedical document summarization systems have paid little attention to readability control, leaving users with summaries that are incompatible with their levels of expertise.\nIn recognition of this urgent demand, we introduce a new task of readability controllable summarization for biomedical documents, which aims to recognise users' readability demands and generate summaries that better suit their needs: technical summaries for experts and plain language summaries (PLS) for laymen.\nTo establish this task, we construct a corpus consisting of biomedical papers with technical summaries and PLSs written by the authors, and benchmark multiple advanced controllable abstractive and extractive summarization models based on pre-trained language models (PLMs) with prevalent controlling and generation techniques.\nMoreover, we propose a novel masked language model (MLM) based metric and its variant to effectively evaluate the readability discrepancy between lay and technical summaries.\nExperimental results from automated and human evaluations show that though current control techniques allow for a certain degree of readability adjustment during generation, the performance of existing controllable summarization methods is far from desirable in this task.", "track": "Summarization", "label": 14}, {"loc": [6.335440158843994, 12.204106330871582], "id": 3096, "title": "Beyond Additive Fusion: Learning Non-Additive Multimodal Interactions", "authors": "Torsten W\u00f6rtwein, Lisa B. Sheeber, Nicholas Allen, Jeffrey Cohn and Louis-Philippe Morency", "abstract": "Multimodal fusion addresses the problem of analyzing spoken words in the multimodal context, including visual expressions and prosodic cues. Even when multimodal models lead to performance improvements, it is often unclear whether bimodal and trimodal interactions are learned or whether modalities are processed independently of each other. We propose Multimodal Residual Optimization (MRO) to separate unimodal, bimodal, and trimodal interactions in a multimodal model. This improves interpretability as the multimodal interaction can be quantified. Inspired by Occam's razor, the main intuition of MRO is that (simpler) unimodal contributions should be learned before learning (more complex) bimodal and trimodal interactions. For example, bimodal predictions should learn to correct the mistakes (residuals) of unimodal predictions, thereby letting the bimodal predictions focus on the remaining bimodal interactions. Empirically, we observe that MRO successfully separates unimodal, bimodal, and trimodal interactions while not degrading predictive performance. We complement our empirical results with a human perception study and observe that MRO learns multimodal interactions that align with human judgments.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.4007134437561035, 12.143303871154785], "id": 3109, "title": "Generalization Differences between End-to-End and Neuro-Symbolic Vision-Language Reasoning Systems", "authors": "Wang Zhu, Jesse Thomason and Robin Jia", "abstract": "For vision-and-language reasoning tasks, both fully connectionist, end-to-end methods and hybrid, neuro-symbolic methods have achieved high in-distribution performance. In which out-of-distribution settings does each paradigm excel? We investigate this question on both single-image and multi-image visual question-answering through four types of generalization tests: a novel segment-combine test for multi-image queries, contrast set, compositional generalization, and cross-benchmark transfer.\nVision-and-language end-to-end trained systems exhibit sizeable performance drops across all these tests. Neuro-symbolic methods suffer even more on cross-benchmark transfer from GQA to VQA, but they show smaller accuracy drops on the other generalization tests and their performance quickly improves by few-shot training. Overall, our results demonstrate the complementary benefits of these two paradigms, and emphasize the importance of using a diverse suite of generalization tests to fully characterize model robustness to distribution shift.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.9796366691589355, 12.407997131347656], "id": 3111, "title": "Learning to Model Multimodal Semantic Alignment for Story Visualization", "authors": "Bowen Li and Thomas Lukasiewicz", "abstract": "Story visualization aims to generate a sequence of images to narrate each sentence in a multi-sentence story, where the images should be realistic and keep global consistency across dynamic scenes and characters. Current works face the problem of semantic misalignment because of their fixed architecture and diversity of input modalities. To address this problem, we explore the semantic alignment between text and image representations by learning to match their semantic levels in the GAN-based generative model. More specifically, we introduce dynamic interactions according to learning to dynamically explore various semantic depths and fuse the different-modal information at a matched semantic level, which thus relieves the text-image semantic misalignment problem. Extensive experiments on different datasets demonstrate the improvements of our approach, neither using segmentation masks nor auxiliary captioning networks, on image quality and story consistency, compared with state-of-the-art methods.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.486021041870117, 5.509698390960693], "id": 3131, "title": "SciFact-Open: Towards open-domain scientific claim verification", "authors": "David Wadden, Kyle Lo, Bailey E. Kuehl, Arman Cohan, Iz Beltagy, Lucy Lu Wang and Hannaneh Hajishirzi", "abstract": "While research on scientific claim verification has led to the development of powerful systems that appear to approach human performance, these approaches have yet to be tested in a realistic setting against large corpora of scientific literature. Moving to this open-domain evaluation setting, however, poses unique challenges; in particular, it is infeasible to exhaustively annotate all evidence documents. In this work, we present SciFact-Open, a new test collection designed to evaluate the performance of scientific claim verification systems on a corpus of 500K research abstracts. Drawing upon pooling techniques from information retrieval, we collect evidence for scientific claims by pooling and annotating the top predictions of four state-of-the-art scientific claim verification models. We find that systems developed on smaller corpora struggle to generalize to SciFact-Open, exhibiting performance drops of at least 15 F1. In addition, analysis of the evidence in SciFact-Open reveals interesting phenomena likely to appear when claim verification systems are deployed in practice, e.g., cases where the evidence supports only a special case of the claim. Our dataset is available at https://github.com/dwadden/scifact-open.", "track": "NLP Applications", "label": 0}, {"loc": [10.748750686645508, 6.844636917114258], "id": 3133, "title": "COMET-QE and Active Learning for Low-Resource Machine Translation", "authors": "Everlyn Chimoto and Bruce Bassett", "abstract": "Active learning aims to deliver maximum benefit when resources are scarce. We use COMET-QE, a reference-free evaluation metric, to select sentences for low-resource neural machine translation. Using Swahili, Kinyarwanda and Spanish for our experiments, we show that COMET-QE significantly outperforms two variants of Round Trip Translation Likelihood (RTTL) and random sentence selection by up to 5 BLEU points for 20k sentences selected by Active Learning on a 30k baseline. This suggests that COMET-QE is a powerful tool for sentence selection in the very low-resource limit.", "track": "Machine Translation", "label": 10}, {"loc": [2.7532668113708496, 8.864214897155762], "id": 3136, "title": "MedicalSum: A Guided Clinical Abstractive Summarization Model for Generating Medical Reports from Patient-Doctor Conversations", "authors": "George Michalopoulos, Kyle Williams, Gagandeep Singh and Thomas Lin", "abstract": "We introduce MedicalSum, a transformer-based sequence-to-sequence architecture for summarizing medical conversations by integrating medical domain knowledge from the Unified Medical Language System (UMLS). The novel knowledge augmentation is performed in three ways: (i) introducing a guidance signal that consists of the medical words in the input sequence, (ii) leveraging semantic type knowledge in UMLS to create clinically meaningful input embeddings, and (iii) making use of a novel weighted loss function that provides a stronger incentive for the model to correctly predict words with a medical meaning. By applying these three strategies, MedicalSum takes clinical knowledge into consideration during the summarization process and achieves state-of-the-art ROUGE score improvements of 0.8-2.1 points (including 6.2% ROUGE-1 error reduction in the PE section) when producing medical summaries of patient-doctor conversations.", "track": "Summarization", "label": 14}, {"loc": [7.464713096618652, 9.526053428649902], "id": 3155, "title": "Leveraging Training Dynamics and Self-Training for Text Classification", "authors": "Tiberiu Sosea and Cornelia Caragea", "abstract": "The effectiveness of pre-trained language models in downstream tasks is highly dependent on the amount of labeled data available for training. Semi-supervised learning (SSL) is a promising technique that has seen wide attention recently due to its effectiveness in improving deep learning models when training data is scarce. Common approaches employ a teacher-student self-training framework, where a teacher network generates pseudo-labels for unlabeled data, which are then used to iteratively train a student network. In this paper, we propose a new self-training approach for text classification that leverages training dynamics of unlabeled data. We evaluate our approach on a wide range of text classification tasks, including emotion detection, sentiment analysis, question classification and gramaticality, which span a variety of domains, e.g, Reddit, Twitter, and online forums. Notably, our method is successful on all benchmarks, obtaining an average increase in F1 score of 3.5% over strong baselines in low resource settings.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.721792221069336, 4.54890251159668], "id": 3167, "title": "Learning to Infer from Unlabeled Data: A Semi-supervised Learning Approach for Robust Natural Language Inference", "authors": "Mobashir Sadat and Cornelia Caragea", "abstract": "Natural Language Inference (NLI) or Recognizing Textual Entailment (RTE) aims at predicting the relation between a pair of sentences (premise and hypothesis) as entailment, contradiction or semantic independence. Although deep learning models have shown promising performance for NLI in recent years, they rely on large scale expensive human-annotated datasets. Semi-supervised learning (SSL) is a popular technique for reducing the reliance on human annotation by leveraging unlabeled data for training. However, despite its substantial success on single sentence classification tasks where the challenge in making use of unlabeled data is to assign \"good enough\" pseudo-labels, for NLI tasks, the nature of unlabeled data is more complex: one of the sentences in the pair (usually the hypothesis) along with the class label are missing from the data and require human annotations, which makes SSL for NLI more challenging. In this paper, we propose a novel way to incorporate unlabeled data in SSL for NLI where we use a conditional language model, BART to generate the hypotheses for the unlabeled sentences (used as premises). Our experiments show that our SSL framework successfully exploits unlabeled data and substantially improves the performance of four NLI datasets in low-resource settings. We release our code here: https://github.com/msadat3/SSL_for_NLI", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [8.041497230529785, 2.985491991043091], "id": 3169, "title": "Unsupervised Text Deidentification", "authors": "John Morris, Justin Chiu, Ramin Zabih and Alexander Rush", "abstract": "Deidentification seeks to anonymize textual data prior to distribution. Automatic deidentification primarily uses supervised named entity recognition from human-labeled data points. We propose an unsupervised deidentification method that masks words that leak personally-identifying information. The approach utilizes a specially trained reidentification model to identify individuals from redacted personal documents. Motivated by K-anonymity based privacy, we generate redactions that ensure a minimum reidentification rank for the correct profile of the document. To evaluate this approach, we consider the task of deidentifying Wikipedia Biographies, and evaluate using an adversarial reidentification metric. Compared to a set of unsupervised baselines, our approach deidentifies documents more completely while removing fewer words. Qualitatively, we see that the approach eliminates many identifying aspects that would fall outside of the common named entity based approach.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [8.064698219299316, 3.122032642364502], "id": 3178, "title": "Federated Continual Learning for Text Classification via Selective Inter-client Transfer", "authors": "Yatin Chaudhary, Pranav Rai, Matthias Schubert, Hinrich Sch\u00fctze and Pankaj Gupta", "abstract": "In this work, we combine the two paradigms: Federated Learning (FL) and Continual Learning (CL) for text classification task in cloud-edge continuum. The objective of Federated Continual Learning (FCL) is to improve deep learning models over life time at each client by (relevant and efficient) knowledge transfer without sharing data. Here, we address challenges in minimizing inter-client interference while knowledge sharing due to heterogeneous tasks across clients in FCL setup. In doing so, we propose a novel framework, Federated Selective Inter-client Transfer (FedSeIT) which selectively combines model parameters of foreign clients. To further maximize knowledge transfer, we assess domain overlap and select informative tasks from the sequence of historical tasks at each foreign client while preserving privacy. Evaluating against the baselines, we show improved performance, a gain of (average) 12.4% in text classification over a sequence of tasks using five datasets from diverse domains. To the best of our knowledge, this is the first work that applies FCL to NLP.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.221919536590576, 7.763498783111572], "id": 3179, "title": "DOROTHIE: Spoken Dialogue for Handling Unexpected Situations in Interactive Autonomous Driving Agents", "authors": "Ziqiao Ma, Benjamin VanDerPloeg, Cristian-Paul Bara, Huang Yidong, Eui-In Kim, Felix Gervits, Matthew Marge and Joyce Chai", "abstract": "In the real world, autonomous driving agents navigate in highly dynamic environments full of unexpected situations where pre-trained models are unreliable. In these situations, what is immediately available to vehicles is often only human operators. Empowering autonomous driving agents with the ability to navigate in a continuous and dynamic environment and to communicate with humans through sensorimotor-grounded dialogue becomes critical. To this end, we introduce Dialogue On the ROad To Handle Irregular Events (DOROTHIE), a novel interactive simulation platform that enables the creation of unexpected situations on the fly to support empirical studies on situated communication with autonomous driving agents. Based on this platform, we created the Situated Dialogue Navigation (SDN), a navigation benchmark of 183 trials with a total of 8415 utterances, around 18.7 hours of control streams, and 2.9 hours of trimmed audio. SDN is developed to evaluate the agent's ability to predict dialogue moves from humans as well as generate its own dialogue moves and physical navigation actions. We further developed a transformer-based baseline model for these SDN tasks. Our empirical results indicate that language guided-navigation in a highly dynamic environment is an extremely difficult task for end-to-end models. These results will provide insight towards future work on robust autonomous driving agents", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [3.6027657985687256, 8.030424118041992], "id": 3183, "title": "He Said, She Said: Style Transfer for Shifting the Perspective of Dialogues", "authors": "Amanda Bertsch, Graham Neubig and Matthew R. Gormley", "abstract": "In this work, we define a new style transfer task: perspective shift, which reframes a dialouge from informal first person to a formal third person rephrasing of the text. This task requires challenging coreference resolution, emotion attribution, and interpretation of informal text. We explore several baseline approaches and discuss further directions on this task when applied to short dialogues. As a sample application, we demonstrate that applying perspective shifting to a dialogue summarization dataset (SAMSum) substantially improves the zero-shot performance of extractive news summarization models on this data. Additionally, supervised extractive models perform better when trained on perspective shifted data than on the original dialogues. We release our code publicly.", "track": "Natural Language Generation", "label": 6}, {"loc": [7.929487228393555, 9.61744213104248], "id": 3184, "title": "Dynamic Augmentation Data Selection for Few-shot Text Classification", "authors": "Guangliang Liu, Lifeng Jin, Owen Yuan and Jiayu Zhou", "abstract": "Data augmentation has been a popular method for fine-tuning pre-trained language models to increase model robustness and performance. With augmentation data coming from modifying gold train data (in-sample augmentation) or being harvested from general domain unlabeled data (out-of-sample augmentation), the quality of such data is the key to successful fine-tuning. In this paper, we propose a dynamic data selection method to select effective augmentation data from different augmentation sources according to the model's learning stage, by identifying a set of augmentation samples that optimally facilitates the learning process of the most current model. The method firstly filters out augmentation samples with noisy pseudo labels through a curriculum learning strategy, then estimates the effectiveness of reserved augmentation data by its influence scores on the current model at every update, allowing the data selection process tightly tailored to model parameters. And the two-stage augmentation strategy considers in-sample augmentation and out-of-sample augmentation in different learning stages. Experiments with both kinds of augmentation data on a variety of sentence classification tasks show that our method outperforms strong baselines, proving the effectiveness of our method. Analysis confirms the dynamic nature of the data effectiveness and the importance of model learning stages in utilization of augmentation data.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [5.0960869789123535, 9.631250381469727], "id": 3195, "title": "KPDROP: Improving Absent Keyphrase Generation", "authors": "Jishnu Ray Chowdhury, Seo Yeon Park, Tuhin Kundu and Cornelia Caragea", "abstract": "Keyphrase generation is the task of generating phrases (keyphrases) that summarize the main topics of a given document. Keyphrases can be either present or absent from the given document. While the extraction of present keyphrases has received much attention in the past, only recently a stronger focus has been placed on the generation of absent keyphrases. However, generating absent keyphrases is challenging; even the best methods show only a modest degree of success. In this paper, we propose a model-agnostic approach called keyphrase dropout (or KPDrop) to improve absent keyphrase generation. In this approach, we randomly drop present keyphrases from the document and turn them into artificial absent keyphrases during training. We test our approach extensively and show that it consistently improves the absent performance of strong baselines in both supervised and resource-constrained semi-supervised settings.", "track": "Information Extraction", "label": 5}, {"loc": [4.527041912078857, 4.499120712280273], "id": 3209, "title": "Natural Language Deduction through Search over Statement Compositions", "authors": "Kaj Bostrom, Zayne Sprague, Swarat Chaudhuri and Greg Durrett", "abstract": "In settings from fact-checking to question answering, we frequently want to know whether a collection of evidence (premises) entails a hypothesis. Existing methods primarily focus on the end-to-end discriminative version of this task, but less work has treated the generative version in which a model searches over the space of statements entailed by the premises to constructively derive the hypothesis. We propose a system for doing this kind of deductive reasoning in natural language by decomposing the task into separate steps coordinated by a search procedure, producing a tree of intermediate conclusions that faithfully reflects the system's reasoning process. Our experiments on the EntailmentBank dataset (Dalvi et al., 2021) demonstrate that the proposed system can successfully prove true statements while rejecting false ones. Moreover, it produces natural language explanations with a 17% absolute higher step validity than those produced by an end-to-end T5 model.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [4.466771125793457, 7.180026054382324], "id": 3216, "title": "EnDex: Evaluation of Dialogue Engagingness at Scale", "authors": "Guangxuan Xu, Ruibo Liu, Fabrice Harel-Canada, Nischal Reddy Chandra and Nanyun Peng", "abstract": "We propose EnDex, the first human-reaction based model to evaluate dialogue engagingness. EnDex is trained on 80k Reddit-based Engagement Dataset (RED) curated using a novel distant-supervision framework. Engagingness is a key measure that captures high-level quality of AI dialogue systems and closely reflects actual user experience. However, data shortage, plus the abstract and extensive definition of engagingness makes it challenging to develop an automatic metric. Our work departs from mainstream approaches that use synthetic negative examples to train binary classifiers, and instead, proposes a solution using distant-supervision from human-reaction feedback. \nTo support the soundness of our EnDex metric, we offer a theoretical foundation for engagement, an extensive ablation study, and empirical evidence of high correlation on five engagingness related datasets. We will release code, off-the-shelf EnDex model, and a large-scale dataset upon paper publication to facilitate future research.", "track": "Resources and Evaluation", "label": 1}, {"loc": [7.29884147644043, 9.583135604858398], "id": 3238, "title": "LOPS: Learning Order Inspired Pseudo-Label Selection for Weakly Supervised Text Classification", "authors": "Dheeraj Mekala, Chengyu Dong and Jingbo Shang", "abstract": "Weakly supervised text classification methods typically train a deep neural classifier based on pseudo-labels. The quality of pseudo-labels is crucial to final performance but they are inevitably noisy due to their heuristic nature, so selecting the correct ones has a huge potential for performance boost. One straightforward solution is to select samples based on the softmax probability scores in the neural classifier corresponding to their pseudo-labels. However, we show through our experiments that such solutions are ineffective and unstable due to the erroneously high-confidence predictions from poorly calibrated models. Recent studies on the memorization effects of deep neural models suggest that these models first memorize training samples with clean labels and then those with noisy labels. Inspired by this observation, we propose a novel pseudo-label selection method LOPS that takes learning order of samples into consideration. We hypothesize that the learning order reflects the probability of wrong annotation in terms of ranking, and therefore, propose to select the samples that are learnt earlier. LOPS can be viewed as a strong performance-boost plug-in to most existing weakly-supervised text classification methods, as confirmed in extensive experiments on four real-world datasets.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [8.688488960266113, 8.2194185256958], "id": 3241, "title": "Train Flat, Then Compress: Sharpness-Aware Minimization Learns More Compressible Models", "authors": "Clara Na, Sanket Vaibhav Mehta and Emma Strubell", "abstract": "Model compression by way of parameter pruning, quantization, or distillation has recently gained popularity as an approach for reducing the computational requirements of modern deep neural network models for NLP. Inspired by prior works suggesting a connection between simpler, more generalizable models and those that lie within wider loss basins, we hypothesize that optimizing for flat minima should lead to simpler parameterizations and thus more compressible models. We propose to combine sharpness-aware minimization (SAM) with various task-specific model compression methods, including iterative magnitude pruning (IMP), structured pruning with a distillation objective, and post-training dynamic quantization. Empirically, we show that optimizing for flatter minima consistently leads to greater compressibility of parameters compared to vanilla Adam when fine-tuning BERT models, with little to no loss in accuracy on the GLUE text classification and SQuAD question answering benchmarks. Moreover, SAM finds superior winning tickets during IMP that 1) are amenable to vanilla Adam optimization, and 2) transfer more effectively across tasks.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [7.64055871963501, 9.80152702331543], "id": 3248, "title": "Structural Contrastive Representation Learning for Zero-shot Multi-label Text Classification", "authors": "Tianyi Zhang, Zhaozhuo Xu, Tharun Medini and Anshumali Shrivastava", "abstract": "Zero-shot multi-label text classification (ZMTC) is a fundamental task in natural language processing with applications in the cold start problem of recommendation systems. Ideally, one would learn an expressive representation of both input text and label features so that ZMTC is transformed into a nearest neighbor search problem. However, the existing representation learning approaches for ZMTC struggle with accuracy as well as poor training efficiency. Firstly, the input text is structural, consisting of both short title sentences and long content paragraphs. It is challenging to model the correlation between short label descriptions and long structural input documents. Secondly, the enormous label space in ZMTC forces the existing approaches to perform multi-stage learning with label engineering. As a result, the training overhead is significant. In this paper, we address both problems by introducing an end-to-end structural contrastive representation learning approach. We propose a randomized text segmentation (RTS) technique to generate high-quality contrastive pairs. This RTS technique allows us to model title-content correlation. Additionally, we simplify the multi-stage ZMTC learning strategy by avoiding label engineering. Extensive experiments demonstrate that our approach leads to up to 2.33% improvement in precision@1 and 5.94x speedup in training time on publicly available datasets. Our code is available publicly.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [8.344463348388672, 8.191740989685059], "id": 3249, "title": "Improving Generalization of Pre-trained Language Models via Stochastic Weight Averaging", "authors": "Peng Lu, Ivan Kobyzev, Mehdi Rezagholizadeh, Ahmad Rashid, Ali Ghodsi and Phillippe Langlais", "abstract": "Knowledge Distillation (KD) is a commonly used technique for improving the generalization of compact Pre-trained Language Models (PLMs) on downstream tasks. However, such methods impose the additional burden of training a separate teacher model for every new dataset.\nAlternatively, one may directly work on the improvement of the optimization procedure of the compact model towards better generalization. Recent works observe that the flatness of the local minimum correlates well with better generalization.\nIn this work, we adapt Stochastic Weight Averaging (SWA), a method encouraging convergence to a flatter minimum, to fine-tuning PLMs. We conduct extensive experiments on various NLP tasks (text classification, question answering, and generation) and different model architectures and demonstrate that our adaptation improves the generalization without extra computation cost. Moreover, we observe that this simple optimization technique is able to outperform the state-of-the-art KD methods for compact models.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [4.091793060302734, 7.472659111022949], "id": 3263, "title": "Learn What Is Possible, Then Choose What Is Best: Disentangling One-To-Many Relations in Language Through Text-based Games", "authors": "Benjamin Towle and Ke Zhou", "abstract": "Language models pre-trained on large self-supervised corpora, followed by task-specific fine-tuning has become the dominant paradigm in NLP. These pre-training datasets often have a one-to-many structure---e.g. in dialogue there are many valid responses for a given context. However, only some of these responses will be desirable in our downstream task. This raises the question of how we should train the model such that it can emulate the desirable behaviours, but not the undesirable ones. Current approaches train in a one-to-one setup---only a single target response is given for a single dialogue context---leading to models only learning to predict the average response, while ignoring the full range of possible responses. Using text-based games as a testbed, our approach, PASA, uses discrete latent variables to capture the range of different behaviours represented in our larger pre-training dataset. We then use knowledge distillation to distil the posterior probability distribution into a student model. This probability distribution is far richer than learning from only the hard targets of the dataset, and thus allows the student model to benefit from the richer range of actions the teacher model has learned. Results show up to 49\\% empirical improvement over the previous state-of-the-art model on the Jericho Walkthroughs dataset.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [7.197891712188721, 6.870428562164307], "id": 3268, "title": "Structurally Diverse Sampling for Sample-Efficient Training and Comprehensive Evaluation", "authors": "Shivanshu Gupta, Sameer Singh and Matt Gardner", "abstract": "A growing body of research has demonstrated the inability of NLP models to generalize compositionally and has tried to alleviate it through specialized architectures, training schemes, and data augmentation, among other approaches. In this work, we study a different approach: training on instances with diverse structures. We propose a model-agnostic algorithm for subsampling such sets of instances from a labeled instance pool with structured outputs. Evaluating on both compositional template splits and traditional IID splits of 5 semantic parsing datasets of varying complexity, we show that structurally diverse training using our algorithm leads to comparable or better generalization than prior algorithms in 9 out of 10 dataset-split type pairs. In general, we find structural diversity to consistently improve sample efficiency compared to random train sets. Moreover, we show that structurally diverse sampling yields comprehensive test sets that are a lot more challenging than IID test sets. Finally, we provide two explanations for improved generalization from diverse train sets: 1) improved coverage of output substructures, and 2) a reduction in spurious correlations between these substructures.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [3.7431910037994385, 9.721627235412598], "id": 3280, "title": "Unsupervised Multi-Granularity Summarization", "authors": "Ming Zhong, Yang Liu, Suyu Ge, Yuning Mao, Yizhu Jiao, Xingxing Zhang, Yichong Xu, Chenguang Zhu, Michael Zeng and Jiawei Han", "abstract": "Text summarization is a user-preference based task, i.e., for one document, users often have different priorities for the summary. As a key aspect of customization in summarization, granularity is used to measure the semantic coverage between the summary and source document. However, developing systems that can generate summaries with customizable semantic coverage is still an under-explored topic. In this paper, we propose the first unsupervised multi-granularity summarization framework, GranuSum. We take events as the basic semantic units of the source documents and propose to rank these events by their salience. We also develop a model to summarize input documents with given events as anchors and hints. By inputting different numbers of events, GranuSum is capable of producing multi-granular summaries in an unsupervised manner. Meanwhile, we annotate a new benchmark GranuDUC that contains multiple summaries at different granularities for each document cluster. Experimental results confirm the substantial superiority of GranuSum on multi-granularity summarization over strong baselines. Furthermore, by exploiting the event information, GranuSum also exhibits state-of-the-art performance under the conventional unsupervised abstractive setting.", "track": "Summarization", "label": 14}, {"loc": [4.186501502990723, 7.724478721618652], "id": 3287, "title": "HeLo: Learning-Free Lookahead Decoding for Conversation Infilling", "authors": "Ivan Lee and Taylor Berg-Kirkpatrick", "abstract": "We propose Heuristic Guided Lookahead Decoding (HeLo), a novel decoding strategy for conversation infilling. Conversation infilling aims to generate a seamless bridge of utterances connecting a given pair of source and target utterances. HeLo does not require fine-tuning or extra models -- only the generating model itself. Instead, HeLo leverages a greedy lookahead phase before committing to any token. The HeLo framework is simple and can augment conventional decoding strategies paired with any autoregressive language model. Smooth transitions between utterances are encouraged with an annealing schedule. Our experiments show HeLo outperforms several baselines when evaluated with both automatic and human evaluation metrics, which, we argue, are appropriate for the task.", "track": "Natural Language Generation", "label": 6}, {"loc": [0.7302057147026062, 7.407279014587402], "id": 3292, "title": "RE-Matching: A Fine-Grained Semantic Matching Method for Zero-Shot Relation Extraction", "authors": "Jun Zhao, WenYu Zhan, Tao Gui, Qi Zhang, Jin Ma and Ying Shan", "abstract": "Semantic matching is a mainstream paradigm of zero-shot relation extraction, which matches a given input with a corresponding label description. \nHowever, the generic matching methods lack explicit modeling of characteristics of relational data. \nIn this work, we propose a fine-grained text semantic matching method tailored to relational data.\nSpecifically, our method consists of an encoding and a matching module. To facilitates efficiency, in the encoding module, we adopt a siamese scheme to separately encode the input and description. To make full use of the characteristics of relational data, in the matching module, we refine the similarity score between input and label descriptions into entity matching score and context matching score. Considering that not all contextual words contribute equally to the relation semantics, we design a context distillation module to filter irrelevant context information to improve context matching. \nExperimental results show that our method achieves higher matching accuracy and more than 10 times faster inference speed, compared with the state-of-the-art methods.", "track": "Information Extraction", "label": 5}, {"loc": [8.217975616455078, 5.80077600479126], "id": 3295, "title": "A Multilingual Generative Transformer for Semantic Sentence Embedding", "authors": "John Wieting, William Cohen, Graham Neubig and Taylor Berg-Kirkpatrick", "abstract": "Contrastive learning has been successfully used for retrieval of semantically aligned sentences, but it often requires large batch sizes or careful engineering to work well. In this paper, we instead propose a generative model for learning multilingual text embeddings which can be also be used to retrieve or score sentence pairs. Our model operates on parallel data in N languages and efficiently encourages source separation in this multilingual setting, separating semantic information that is shared between translations from stylistic or language-specific variation. We evaluate this method on a suite of tasks including semantic similarity, bitext mining, and cross-lingual question retrieval - the last of which we introduce in this paper. Overall, our model outperforms both a strong contrastive and generative baseline on these tasks.", "track": "Multilinguality", "label": 13}, {"loc": [8.106181144714355, 3.111661911010742], "id": 3296, "title": "Invernet: An Inversion Attack Framework to Infer Fine-Tuning Datasets through Word Embeddings", "authors": "Ishrak Hayet, Zijun Yao and Bo Luo", "abstract": "Word embedding aims to learn the dense representation of words and has become a regular input preparation in many NLP tasks. Due to the data and computation intensive nature of learning embeddings from scratch, a more affordable way is to borrow the pretrained embedding available in public and fine-tune the embedding through a domain specific downstream dataset. A privacy concern can arise if a malicious owner of the pretrained embedding gets access to the fine-tuned embedding and tries to infer the critical information from the downstream datasets. In this study, we propose a novel embedding inversion framework called Invernet that materializes the privacy concern by inferring the context distribution in the downstream dataset, which can lead to key information breach. With extensive experimental studies on two real-world news datasets: Antonio Gulli's News and New York Times, we validate the feasibility of proposed privacy attack and demonstrate the effectiveness of Invernet on inferring downstream datasets based on multiple word embedding methods.", "track": "NLP Applications", "label": 0}, {"loc": [4.773763179779053, 4.6390485763549805], "id": 3336, "title": "LawngNLI: A Long-Premise Benchmark for In-Domain Generalization from Short to Long Contexts and for Implication-Based Retrieval", "authors": "William Bruno and Dan Roth", "abstract": "Natural language inference has trended toward studying contexts beyond the sentence level. An important application area is law: past cases often do not foretell how they apply to new situations and implications must be inferred. This paper introduces LawngNLI, constructed from U.S. legal opinions with automatic labels with high human-validated accuracy. Premises are long and multigranular. Experiments show two use cases. First, LawngNLI can benchmark for in-domain generalization from short to long contexts. It has remained unclear if large-scale long-premise NLI datasets actually need to be constructed: near-top performance on long premises could be achievable by fine-tuning using short premises. Without multigranularity, benchmarks cannot distinguish lack of fine-tuning on long premises versus domain shift between short and long datasets. In contrast, our long and short premises share the same examples and domain. Models fine-tuned using several past NLI datasets and/or our short premises fall short of top performance on our long premises. So for at least certain domains (such as ours), large-scale long-premise datasets are needed. Second, LawngNLI can benchmark for implication-based retrieval. Queries are entailed or contradicted by target documents, allowing users to move between arguments and evidence. Leading retrieval models perform reasonably zero shot on a LawngNLI-derived retrieval task. We compare different systems for re-ranking, including lexical overlap and cross-encoders fine-tuned using a modified LawngNLI or past NLI datasets. LawngNLI can train and test systems for implication-based case retrieval and argumentation.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [8.108076095581055, 3.051406145095825], "id": 3356, "title": "Distillation-Resistant Watermarking for Model Protection in NLP", "authors": "Xuandong Zhao, Lei Li and Yu-Xiang Wang", "abstract": "How can we protect the intellectual property of trained NLP models? Modern NLP models are prone to stealing by querying and distilling from their publicly exposed APIs. However, existing protection methods such as watermarking only work for images but are not applicable to text. We propose Distillation-Resistant Watermarking (DRW), a novel technique to protect NLP models from being stolen via distillation. DRW protects a model by injecting watermarks into the victim's prediction probability corresponding to a secret key and is able to detect such a key by probing a suspect model. We prove that a protected model still retains the original accuracy within a certain bound. We evaluate DRW on a diverse set of NLP tasks including text classification, part-of-speech tagging, and named entity recognition. Experiments show that DRW protects the original model and detects stealing suspects at 100% mean average precision for all four tasks while the prior method fails on two.", "track": "Ethics", "label": 21}, {"loc": [5.919833660125732, 5.08413553237915], "id": 3358, "title": "NeuroCounterfactuals: Beyond Minimal-Edit Counterfactuals for Richer Data Augmentation", "authors": "Phillip Howard, Gadi Singer, Vasudev Lal, Yejin Choi and Swabha Swayamdipta", "abstract": "While counterfactual data augmentation offers a promising step towards robust generalization in natural language processing, producing a set of counterfactuals that offer valuable inductive bias for models remains a challenge. Most existing approaches for producing counterfactuals, manual or automated, rely on small perturbations via minimal edits, resulting in simplistic changes. We introduce NeuroCounterfactuals, designed as loose counterfactuals, allowing for larger edits which result in naturalistic generations containing linguistic diversity, while still bearing similarity to the original document. Our novel generative approach bridges the benefits of constrained decoding, with those of language model adaptation for sentiment steering. Training data augmentation with our generations results in both in-domain and out-of-domain improvements for sentiment classification, outperforming even manually curated counterfactuals, under select settings. We further present detailed analyses to show the advantages of NeuroCounterfactuals over approaches involving simple, minimal edits.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [6.196130752563477, 5.418179512023926], "id": 3364, "title": "Don't Just Clean It, Proxy Clean It: Mitigating Bias by Proxy in Pre-Trained Models", "authors": "Swetasudha Panda, Ari Kobren, Michael Wick and Qinlan Shen", "abstract": "Transformer-based pre-trained models are known to encode societal biases not only in their contextual representations, but also in downstream predictions when fine-tuned on task-specific data.\nWe present D-Bias, an approach that selectively eliminates stereotypical associations (e.g, co-occurrence statistics) at fine-tuning, such that the model doesn't learn to excessively rely on those signals.\nD-Bias attenuates biases from both identity words and frequently co-occurring proxies, which we select using pointwise mutual information.\nWe apply D-Bias to a) occupation classification, and b) toxicity classification and find that our approach substantially reduces downstream biases (e.g. by > 60% in toxicity classification, for identities that are most frequently flagged as toxic on online platforms).\nIn addition, we show that D-Bias dramatically improves upon scrubbing, i.e., removing only the identity words in question.\nWe also demonstrate that D-Bias easily extends to multiple identities, and achieves competitive performance with two recently proposed debiasing approaches: R-LACE and INLP.", "track": "Ethics", "label": 21}, {"loc": [6.447549343109131, 5.709017276763916], "id": 3367, "title": "The Undesirable Dependence on Frequency of Gender Bias Metrics Based on Word Embeddings", "authors": "Francisco Valentini, Germ\u00e1n Rosati, Diego Fernandez Slezak and Edgar Altszyler", "abstract": "Numerous works use word embedding-based metrics to quantify societal biases and stereotypes in texts. Recent studies have found that word embeddings can capture semantic similarity but may be affected by word frequency. In this work we study the effect of frequency when measuring female vs. male gender bias with word embedding-based bias quantification methods. We find that Skip-gram with negative sampling and GloVe tend to detect male bias in high frequency words, while GloVe tends to return female bias in low frequency words. We show these behaviors still exist when words are randomly shuffled. This proves that the frequency-based effect observed in unshuffled corpora stems from properties of the metric rather than from word associations. The effect is spurious and problematic since bias metrics should depend exclusively on word co-occurrences and not individual word frequencies. Finally, we compare these results with the ones obtained with an alternative metric based on Pointwise Mutual Information. We find that this metric does not show a clear dependence on frequency, even though it is slightly skewed towards male bias across all frequencies.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [4.813739776611328, 4.814473628997803], "id": 3369, "title": "BioNLI: Generating a Biomedical NLI Dataset Using Lexico-semantic Constraints for Adversarial Examples", "authors": "Mohaddeseh Bastan, Mihai Surdeanu and Niranjan Balasubramanian", "abstract": "Natural language inference (NLI) is critical in many domains requiring complex decision-making, such as the biomedical domain. We introduce a novel semi-supervised procedure that bootstraps biomedical NLI datasets from positive entailment examples present in abstracts of biomedical publications. We focus on challenging texts where the hypothesis includes mechanistic information such as biochemical interactions between two entities. A key contribution of this work is automating the creation of negative examples that are informative without being simplistic. We generate a range of negative examples using nine strategies that manipulate the structure of the underlying mechanisms both with rules, e.g., flip the roles of the entities in the interaction, and, more importantly, by imposing the perturbed conditions as logical constraints in a neuro-logical decoding system \\cite{lu-etal-2021-neurologic}.\nWe use this procedure to create a novel dataset for NLI in the biomedical domain, called \\dataset. The accuracy of neural classifiers on this dataset is in the mid 70s F1, which indicates that this NLI task remains to be solved. Critically, we observe that the performance on the different classes of negative examples varies widely, from 97\\% F1 on the simple negative examples that change the role of the entities in the hypothesis, to barely better than chance on the negative examples generated using neuro-logic decoding.", "track": "Resources and Evaluation", "label": 1}, {"loc": [6.573816299438477, 11.990452766418457], "id": 3377, "title": "Self-supervised Cross-modal Pretraining for Speech Emotion Recognition and Sentiment Analysis", "authors": "Iek-Heng Chu, Ziyi Chen, Xinlu Yu, Mei Han, Jing Xiao and Peng Chang", "abstract": "Multimodal speech emotion recognition (SER) and sentiment analysis (SA) are important techniques for human-computer interaction. Most existing multimodal approaches utilize either shallow cross-modal fusion of pretrained features, or deep cross-modal fusion with raw features. Recently, attempts have been made to fuse pretrained feature representations in a deep fusion manner during fine-tuning stage. However those approaches have not led to improved results, partially due to their relatively simple fusion mechanisms and lack of proper cross-modal pretraining. In this work, leveraging single-modal pretrained models (RoBERTa and HuBERT), we propose a novel deeply-fused audio-text bi-modal transformer with carefully designed cross-modal fusion mechanism and a stage-wise cross-modal pretraining scheme to fully facilitate the cross-modal learning. Our experiment results show that the proposed method achieves state-of-the-art results on the public IEMOCAP emotion and CMU-MOSEI sentiment datasets, exceeding the previous benchmarks by a large margin.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [6.518058776855469, 12.351614952087402], "id": 3383, "title": "Multimodal Conversation Modelling for Topic Derailment Detection", "authors": "Zhenhao Li, Marek Rei and Lucia Specia", "abstract": "Conversations on social media tend to go off-topic and turn into different and sometimes toxic exchanges. Previous work focuses on analysing textual dialogues that have derailed into toxic content, but the range of derailment types is much broader, including spam or bot content, tangential comments, etc. In addition, existing work disregards conversations that involve visual information (i.e. images or videos), which are prevalent on most platforms. In this paper, we take a broader view of conversation derailment and propose a new challenge: detecting derailment based on the \"change of conversation topic\", where the topic is defined by an initial post containing both a text and an image. For that, we (i) create the first Multimodal Conversation Derailment (MCD) dataset, and (ii) introduce a new multimodal conversational architecture (MMConv) that utilises visual and conversational contexts to classify comments for derailment. Experiments show that MMConv substantially outperforms previous text-based approaches to detect conversation derailment, as well as general multimodal classifiers. MMConv is also more robust to textual noise, since it relies on richer contextual information.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [3.7512354850769043, 9.798709869384766], "id": 3387, "title": "Active Learning for Abstractive Text Summarization", "authors": "Akim Tsvigun, Ivan Lysenko, Danila Sedashov, Ivan Lazichny, Eldar Damirov, Vladimir Alexeyevich Karlov, Artemy V. Belousov, Leonid Sanochkin, Maxim Panov, Alexander Panchenko, Mikhail Burtsev and Artem Shelmanov", "abstract": "Construction of human-curated annotated datasets for abstractive text summarization (ATS) is very time-consuming and expensive because creating each instance requires a human annotator to read a long document and compose a shorter summary that would preserve the key information relayed by the original document. Active Learning (AL) is a technique developed to reduce the amount of annotation required to achieve a certain level of machine learning model performance. In information extraction and text classification, AL can reduce the amount of labor up to multiple times. Despite its potential for aiding expensive annotation, as far as we know, there were no effective AL query strategies for ATS. This stems from the fact that many AL strategies rely on uncertainty estimation, while as we show in our work, uncertain instances are usually noisy, and selecting them can degrade the model performance compared to passive annotation. We address this problem by proposing the first effective query strategy for AL in ATS based on diversity principles. We show that given a certain annotation budget, using our strategy in AL annotation helps to improve the model performance in terms of ROUGE and consistency scores. Additionally, we analyze the effect of self-learning and show that it can additionally increase the performance of the model.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [6.059288024902344, 8.842109680175781], "id": 3388, "title": "Finding Memo: Extractive Memorization in Constrained Sequence Generation Tasks", "authors": "Vikas Raunak and Arul Menezes", "abstract": "Memorization presents a challenge for several constrained Natural Language Generation (NLG) tasks such as Neural Machine Translation (NMT), wherein the proclivity of neural models to memorize noisy and atypical samples reacts adversely with the noisy (web crawled) datasets. However, previous studies of memorization in constrained NLG tasks have only focused on counterfactual memorization, linking it to the problem of hallucinations. In this work, we propose a new, inexpensive algorithm for extractive memorization (exact training data generation under insufficient context) in constrained sequence generation tasks and use it to study extractive memorization and its effects in NMT. We demonstrate that extractive memorization poses a serious threat to NMT reliability by qualitatively and quantitatively characterizing the memorized samples as well as the model behavior in their vicinity. Based on empirical observations, we develop a simple algorithm which elicits non-memorized translations of memorized samples from the same model, for a large fraction of such samples. Finally, we show that the proposed algorithm could also be leveraged to mitigate memorization in the model through finetuning. We have released the code to reproduce our results at https://github.com/vyraun/Finding-Memo.", "track": "Machine Translation", "label": 10}, {"loc": [10.930545806884766, 6.814829349517822], "id": 3392, "title": "SALTED: A Framework for SAlient Long-tail Translation Error Detection", "authors": "Vikas Raunak, Matt Post and Arul Menezes", "abstract": "Traditional machine translation (MT) metrics provide an average measure of translation quality that is insensitive to the long tail of behavioral problems. Examples include translation of numbers, physical units, dropped content and hallucinations. These errors, which occur rarely and unpredictably in Neural Machine Translation (NMT), greatly undermine the reliability of state-of-the-art MT systems. Consequently, it is important to have visibility into these problems during model development.\nTowards this end, we introduce SALTED, a specifications-based framework for behavioral testing of NMT models. At the core of our approach is the use of high-precision detectors that flag errors (or alternatively, verify output correctness) between a source sentence and a system output. These detectors provide fine-grained measurements of long-tail errors, providing a trustworthy view of problems that were previously invisible. We demonstrate that such detectors could be used not just to identify salient long-tail errors in MT systems, but also for higher-recall filtering of the training data, fixing targeted errors with model fine-tuning in NMT and generating novel data for metamorphic testing to elicit further bugs in models.", "track": "Machine Translation", "label": 10}, {"loc": [3.2614967823028564, 4.833678722381592], "id": 3397, "title": "Discord Questions: A Computational Approach To Diversity Analysis in News Coverage", "authors": "Philippe Laban, Chien-Sheng Wu, Lidiya Murakhovs'ka, Xiang Chen and Caiming Xiong", "abstract": "There are many potential benefits to news readers accessing diverse sources. Modern news aggregators do the hard work of organizing the news, offering readers a plethora of source options, but choosing which source to read remains challenging.\nWe propose a new framework to assist readers in identifying source differences and gaining an understanding of news coverage diversity.\nThe framework is based on the generation of Discord Questions: questions with a diverse answer pool, explicitly illustrating source differences.\nTo assemble a prototype of the framework, we focus on two components: (1) discord question generation, the task of generating questions answered differently by sources, for which we propose an automatic scoring method, and create a model that improves performance from current question generation (QG) methods by 5%, (2) answer consolidation, the task of grouping answers to a question that are semantically similar, for which we collect data and repurpose a method that achieves 81% balanced accuracy on our realistic test set.\nWe illustrate the framework's feasibility through a prototype interface. Even though model performance at discord QG still lags human performance by more than 15%, generated questions are judged to be more interesting than factoid questions and can reveal differences in the level of detail, sentiment, and reasoning of sources in news coverage. Code is available at https://github.com/Salesforce/discord_questions.", "track": "NLP Applications", "label": 0}, {"loc": [6.230443477630615, 5.289445877075195], "id": 3398, "title": "Adversarial Multi-task Training for Debiased Toxicity Detection", "authors": "Zhenhao Li, Marina Fomicheva, Ozan Caglayan and Lucia Specia", "abstract": "Current toxicity detection models tend to overfit on specific terms such as profanity words or group identity mentions, among others. This results in unintended biases in predictions, such as models predicting all occurrences of \"black people\" as toxic. Previous work on mitigating biases in toxicity detection focuses on improving models via techniques such as regularisation, data sampling, and ensembling. In this paper, we propose a new method where potential biases are modelled as additional tasks. Our multi-task learning approach extracts representations that share relevant information across tasks, and relies on adversarial learning to disentangle representations that are task-specific and could lead to biases. Our experiments on three potential biases -- sentiment, profanity, and identity mentions -- show that our method successfully mitigates unintended biases and improves toxicity detection performance on out-of-domain settings while also improving or maintaining performance on in-domain settings.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [3.089956045150757, 4.662405490875244], "id": 3406, "title": "FocusQA: Open-Domain Question Answering with a Context in Focus", "authors": "Gianni Barlacchi, Ivano Lauriola, Alessandro Moschitti, Marco Del Tredici, Xiaoyu Shen, Thuy Vu, Bill Byrne and Adri\u00e0 de Gispert", "abstract": "We introduce question answering with a cotext in focus, a task that simulates a free interaction with a QA system. The user reads on a screen some information about a topic, and they can follow-up with questions that can be either related or not to the topic; and the answer can be found in the document containing the screen content or from other pages. We call such information context. To study the task, we construct FocusQA, a dataset for answer sentence selection (AS2) with 12,165011\nunique question/context pairs, and a total of 109,940 answers. To build the dataset, we developed a novel methodology that takes existing questions and pairs them with relevant contexts. To show the benefits of this approach, we present a comparative analysis with a set of questions written by humans after reading the context, showing that our approach greatly helps in eliciting more realistic question/context pairs. Finally, we show that the task poses several challenges for incorporating contextual information. In this respect, we introduce strong baselines for answer sentence selection that outperform the precision of state-of-the-art models for AS2 up to 21.3% absolute points.", "track": "Resources and Evaluation", "label": 1}, {"loc": [4.950631618499756, 5.908518314361572], "id": 3413, "title": "Challenges and Opportunities in Information Manipulation Detection: An Examination of Wartime Russian Media", "authors": "Chan Young Park, Julia Mendelsohn, Anjalie Field and Yulia Tsvetkov", "abstract": "NLP research on public opinion manipulation campaigns has primarily focused on detecting overt strategies such as fake news and disinformation. However, information manipulation in the ongoing Russia-Ukraine war exemplifies how governments and media also employ more nuanced strategies. We release a new dataset, VoynaSlov, containing 38M+ posts from Russian media outlets on Twitter and VKontakte, as well as public activity and responses, immediately preceding and during the 2022 Russia-Ukraine war. We apply standard and recently-developed NLP models on VoynaSlov to examine agenda setting, framing, and priming, several strategies underlying information manipulation, and reveal variation across media outlet control, social media platform, and time. Our examination of these media effects and extensive discussion of current approaches' limitations encourage further development of NLP models for understanding information manipulation in emerging crises, as well as other real-world and interdisciplinary tasks.", "track": "Ethic Concerns:Theme Track", "label": 18}, {"loc": [8.032393455505371, 9.707032203674316], "id": 3417, "title": "Disentangling Task Relations for Few-shot Text Classification via Self-Supervised Hierarchical Task Clustering", "authors": "Juan Zha, Zheng Li, Ying Wei and Yu Zhang", "abstract": "Few-Shot Text Classification (FSTC) imitates humans to learn a new text classifier efficiently with only few examples, by leveraging prior knowledge from historical tasks. However, most prior works assume that all the tasks are sampled from a single data source, which cannot adapt to real-world scenarios where tasks are heterogeneous and lie in different distributions. As such, existing methods may suffer from their globally knowledge-shared mechanisms to handle the task heterogeneity. On the other hand, inherent task relationships are not explicitly captured, making task knowledge unorganized and hard to transfer to new tasks. Thus, we explore a new FSTC setting where tasks can come from a diverse range of data sources. To address the task heterogeneity, we propose a self-supervised hierarchical task clustering (SS-HTC) method. SS-HTC not only customizes the cluster-specific knowledge by dynamically organizing heterogeneous tasks into different clusters in hierarchical levels but also disentangles the underlying relations between tasks to improve the interpretability. Empirically, extensive experiments on five public FSTC benchmark datasets demonstrate the effectiveness of SS-HTC.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [1.3641283512115479, 4.940248966217041], "id": 3427, "title": "XRICL: Cross-lingual Retrieval-Augmented In-Context Learning for Cross-lingual Text-to-SQL Semantic Parsing", "authors": "Peng Shi, Rui Zhang, He Bai and Jimmy Lin", "abstract": "In-context learning using large language models has recently shown surprising results for semantic parsing tasks such as Text-to-SQL translation.\nPrompting GPT-3 or Codex using several examples of question-SQL pairs can produce excellent results, comparable to state-of-the-art finetuning-based models.\nHowever, existing work primarily focuses on English datasets, and it is unknown whether large language models can serve as competitive semantic parsers for other languages.\nTo bridge this gap, our work focuses on cross-lingual Text-to-SQL semantic parsing for translating non-English utterances into SQL queries based on an English schema.\nWe consider a zero-shot transfer learning setting with the assumption that we do not have any labeled examples in the target language (but have annotated examples in English).\nThis work introduces the XRICL framework, which learns to retrieve relevant English exemplars for a given query to construct prompts.\nWe also include global translation exemplars for a target language to facilitate the translation process for large language models.\nTo systematically evaluate our model, we construct two new benchmark datasets, XSpider and XKaggle-dbqa, which include questions in Chinese, Vietnamese, Farsi, and Hindi.\nOur experiments show that XRICL effectively leverages large pre-trained language models to outperform existing baselines.\nData and code are publicly available at https://github.com/Impavidity/XRICL.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [7.818761825561523, 7.932519912719727], "id": 3432, "title": "Continuation KD: Improved Knowledge Distillation through the Lens of Continuation Optimization", "authors": "Aref Jafari, Ivan Kobyzev, Mehdi Rezagholizadeh, Pascal Poupart and Ali Ghodsi", "abstract": "Knowledge Distillation (KD) has been extensively used for natural language understanding (NLU) tasks to improve a small model's (a student) generalization by transferring the knowledge from a larger model (a teacher). Although KD methods achieve state-of-the-art performance in numerous settings, they suffer from several problems limiting their performance. It is shown in the literature that the capacity gap between the teacher and the student networks can make KD ineffective. Additionally, existing KD techniques do not mitigate the noise in the teacher's output: modeling the noisy behaviour of the teacher can distract the student from learning more useful features. We propose a new KD method that addresses these problems and facilitates the training compared to previous techniques. Inspired by continuation optimization, we design a training procedure that optimizes the highly non-convex KD objective by starting with the smoothed version of this objective and making it more complex as the training proceeds. Our method (Continuation-KD) achieves state-of-the-art performance across various compact architectures on NLU (GLUE benchmark) and computer vision tasks (CIFAR-10 and CIFAR-100).", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [5.686916828155518, 6.2741570472717285], "id": 3435, "title": "Detecting Dementia from Long Neuropsychological Interviews", "authors": "Nauman Dawalatabad, Yuan Gong, Sameer Khurana, Rhoda Au and James Glass", "abstract": "Neuropsychological exams are commonly used to diagnose various kinds of cognitive impairment. They typically involve a trained examiner who conducts a series of cognitive tests with a subject. In recent years, there has been growing interest in developing machine learning methods to extract speech and language biomarkers from exam recordings to provide automated input for cognitive assessment. Inspired by recent findings suggesting that the examiner's language can influence cognitive impairment classifications, in this paper, we study the influence of the examiner on automatic dementia identification decisions in real-world neuropsychological exams. To mitigate the influence of the examiner, we propose a systematic three-stage pipeline for detecting dementia from exam recordings. In the first stage, we perform audio-based speaker diarization (i.e., estimating who spoke when?) by incorporating speaker discriminative features. In the second stage, we employ text-based language models to identify the role of the speaker (i.e., examiner or subject). Finally, in the third stage, we employ text- and audio-based models to detect cognitive impairment from hypothesized subject segments. Our studies suggest that incorporating audio-based diarization followed by text-based role identification helps mitigate the influences from the examiner's segments. Further, we found that the text and audio modalities complement each other, and the performance improves when we use both modalities. We also perform several carefully designed experimental studies to assess the performance of each stage.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.780162334442139, 8.679004669189453], "id": 3438, "title": "Sarcasm Detection is Way Too Easy! An Empirical Comparison of Human and Machine Sarcasm Detection", "authors": "Ibrahim Abu Farha, Steven R. Wilson, Silviu Oprea and Walid Magdy", "abstract": "Recently, author-annotated sarcasm datasets, which focus on intended, rather than perceived sarcasm, have been introduced. Although datasets collected using first-party annotation have important benefits, there is no comparison of human and machine performance on these new datasets. In this paper, we collect new annotations to provide human-level benchmarks for these first-party annotated sarcasm tasks in both English and Arabic, and compare the performance of human annotators to that of state-of-the-art sarcasm detection systems. Our analysis confirms that sarcasm detection is extremely challenging, with individual humans performing close to or slightly worse than the best trained models. With majority voting, however, humans are able to achieve the best results on all tasks. We also perform error analysis, finding that some of the most challenging examples are those that require additional context. We also highlight common features and patterns used to express sarcasm in English and Arabic such as idioms and proverbs. We suggest that to better capture sarcasm, future sarcasm detection datasets and models should focus on representing conversational and cultural context while leveraging world knowledge and common sense.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [1.3317899703979492, 4.932599067687988], "id": 3442, "title": "Cross-lingual Text-to-SQL Semantic Parsing with Representation Mixup", "authors": "Peng Shi, Linfeng Song, Lifeng Jin, Haitao Mi, He Bai, Jimmy Lin and Dong Yu", "abstract": "We focus on the cross-lingual Text-to-SQL semantic parsing task,\nwhere the parsers are expected to generate SQL for non-English utterances based on English database schemas.\nIntuitively, English translation as side information is an effective way to bridge the language gap,\nbut noise introduced by the translation system may affect parser effectiveness.\nIn this work, we propose a Representation Mixup Framework (Rex) for effectively exploiting translations in the cross-lingual Text-to-SQL task.\nParticularly, it uses a general encoding layer, a transition layer, and a target-centric layer to properly guide the information flow of the English translation.\nExperimental results on CSpider and VSpider show that our framework can benefit from cross-lingual training and improve the effectiveness of semantic parsers, achieving state-of-the-art performance.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [8.81701946258545, 6.611511707305908], "id": 3445, "title": "JamPatoisNLI: A Jamaican Patois Natural Language Inference Dataset", "authors": "Ruth-Ann Hazel Armstrong, John Hewitt and Christopher D. Manning", "abstract": "JamPatoisNLI provides the first dataset for natural language inference in a creole language, Jamaican Patois.\nMany of the most-spoken low-resource languages are creoles. These languages commonly have a lexicon derived from a major world language and a distinctive grammar reflecting the languages of the original speakers and the process of language birth by creolization. This gives them a distinctive place in exploring the effectiveness of transfer from large monolingual or multilingual pretrained models. While our work, along with previous work, shows that transfer from these models to low-resource languages that are unrelated to languages in their training set is not very effective, we would expect stronger results from transfer to creoles. Indeed, our experiments show considerably better results from few-shot learning of JamPatoisNLI than for such unrelated languages, and help us begin to understand how the unique relationship between creoles and their high-resource base languages affect cross-lingual transfer. JamPatoisNLI, which consists of naturally-occurring premises and expert-written hypotheses, is a step towards steering research into a traditionally underserved language and a useful benchmark for understanding cross-lingual NLP.", "track": "Multilinguality", "label": 13}, {"loc": [6.900683879852295, 9.907169342041016], "id": 3457, "title": "Are Neural Topic Models Broken?", "authors": "Alexander Miserlis Hoyle, Rupak Sarkar, Pranav Goel and Philip Resnik", "abstract": "Recently, the relationship between automated and human evaluation of topic models has been called into question. Method developers have staked the efficacy of new topic model variants on automated measures, and their failure to approximate human preferences places these models on uncertain ground. Moreover, existing evaluation paradigms are often divorced from real-world use.\n\nMotivated by content analysis as a dominant real-world use case for topic modeling, we analyze two related aspects of topic models that affect their effectiveness and trustworthiness in practice for that purpose: the stability of their estimates and the extent to which the model's discovered categories align with human-determined categories in the data. We find that neural topic models fare worse in both respects compared to an established classical method. We take a step toward addressing both issues in tandem by demonstrating that a straightforward ensembling method can reliably outperform the members of the ensemble.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [4.566172122955322, 7.567196369171143], "id": 3468, "title": "Know Thy Strengths: Comprehensive Dialogue State Tracking Diagnostics", "authors": "Hyundong Cho, Chinnadhurai Sankar, Christopher Lin, Kaushik Ram Sadagopan, Shahin Shayandeh, Asli Celikyilmaz, Jonathan May and Ahmad Beirami", "abstract": "Recent works that revealed the vulnerability of dialogue state tracking (DST) models to distributional shifts have made holistic comparisons on robustness and qualitative analyses increasingly important for understanding their relative performance. \nWe present our findings from standardized and comprehensive DST diagnoses, which have previously been sparse and uncoordinated, using our toolkit, CheckDST, a collection of robustness tests and failure mode analytics. \nWe discover that different classes of DST models have clear strengths and weaknesses, where generation models are more promising for handling language variety while span-based classification models are more robust to unseen entities. \nPrompted by this discovery, we also compare checkpoints from the same model and find that the standard practice of selecting checkpoints using validation loss/accuracy is prone to overfitting and each model class has distinct patterns of failure. \nLastly, we demonstrate how our diagnoses motivate a pre-finetuning procedure with non-dialogue data that offers comprehensive improvements to generation models by alleviating the impact of distributional shifts through transfer learning.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [1.7146663665771484, 5.347075462341309], "id": 3470, "title": "Open-domain Question Answering via Chain of Reasoning over Heterogeneous Knowledge", "authors": "Kaixin Ma, Hao Cheng, Xiaodong Liu, Eric Nyberg and Jianfeng Gao", "abstract": "We propose a novel open-domain question answering (ODQA) framework for answering single/multi-hop questions across heterogeneous knowledge sources.\nThe key novelty of our method is the introduction of the intermediary modules into the current retriever-reader pipeline.\nUnlike previous methods that solely rely on the retriever for gathering all evidence in isolation,\nour intermediary performs a chain of reasoning over the retrieved set.\nSpecifically, our method links the retrieved evidence with its related global context into graphs and organizes them into a candidate list of evidence chains.\nBuilt upon pretrained language models, our system achieves competitive performance on two ODQA datasets, OTT-QA and NQ, against tables and passages from Wikipedia.\nIn particular, our model substantially outperforms the previous state-of-the-art on OTT-QA with an exact match score of 47.3 (45% relative gain).", "track": "Question Answering", "label": 11}, {"loc": [9.058396339416504, 6.4504241943359375], "id": 3472, "title": "Detecting Languages Unintelligible to Multilingual Models through Local Structure Probes", "authors": "Louis Clouatre, Prasanna Parthasarathi, Amal Zouaq and Sarath Chandar", "abstract": "Providing better language tools for low-resource and endangered languages is imperative for equitable growth.\nRecent progress with massively multilingual pretrained models has proven surprisingly effective at performing zero-shot transfer to a wide variety of languages.\nHowever, this transfer is not universal, with many languages not currently understood by multilingual approaches.\nIt is estimated that only 72 languages possess a \"small set of labeled datasets\" on which we could test a model's performance, the vast majority of languages not having the resources available to simply evaluate performances on.\nIn this work, we attempt to clarify which languages do and do not currently benefit from such transfer.\nTo that end, we develop a general approach that requires only unlabelled text to detect which languages are not well understood by a cross-lingual model.\nOur approach is derived from the hypothesis that if a model's understanding is insensitive to perturbations to text in a language, it is likely to have a limited understanding of that language.\nWe construct a cross-lingual sentence similarity task to evaluate our approach empirically on 350, primarily low-resource, languages.", "track": "Multilinguality", "label": 13}, {"loc": [2.373991012573242, 7.31970739364624], "id": 3481, "title": "Open-Vocabulary Argument Role Prediction For Event Extraction", "authors": "Yizhu Jiao, Sha Li, Yiqing Xie, Ming Zhong, Heng Ji and Jiawei Han", "abstract": "The argument role in event extraction refers to the relation between an event and an argument participating in it. Despite the great progress in event extraction, existing studies still depend on roles pre-defined by domain experts. These studies expose obvious weakness when extending to emerging event types or new domains without available roles. Therefore, more attention and effort needs to be devoted to automatically customizing argument roles. In this paper, we define this essential but under-explored task: \\textbf{open-vocabulary argument role prediction}. The goal of this task is to infer a set of argument roles for a given event type. We propose a novel unsupervised framework, \\textsc{RolePred} for this task. Specifically, we formulate the role prediction problem as an in-filling task and construct prompts for a pre-trained language model to generate candidate roles. By extracting and analyzing the candidate arguments, the event-specific roles are further merged and selected. To standardize the research of this task, we collect a new human-annotated event extraction dataset including 143 customized argument roles with rich semantics. On this dataset, \\textsc{RolePred} outperforms the existing methods by a large margin.", "track": "Information Extraction", "label": 5}, {"loc": [9.52770709991455, 7.464467525482178], "id": 3497, "title": "Token-level Sequence Labeling for Spoken Language Understanding using Compositional End-to-End Models", "authors": "Siddhant Arora, Siddharth Dalmia, Brian Yan, Florian Metze, Alan W Black and Shinji Watanabe", "abstract": "End-to-end spoken language understanding (SLU) systems are gaining popularity over cascaded approaches due to their simplicity and ability to avoid error propagation. However, these systems model sequence labeling as a sequence prediction task causing a divergence from its well-established token-level tagging formulation. We build compositional end-to-end SLU systems that explicitly separate the added complexity of recognizing spoken mentions in SLU from the NLU task of sequence labeling. By relying on intermediate decoders trained for ASR, our end-to-end systems transform the input modality from speech to token-level representations that can be used in the traditional sequence labeling framework. This composition of ASR and NLU formulations in our end-to-end SLU system offers direct compatibility with pre-trained ASR and NLU systems, allows performance monitoring of individual components and enables the use of globally normalized losses like CRF, making them attractive in practical scenarios. Our models outperform both cascaded and direct end-to-end models on a labeling task of named entity recognition across SLU benchmarks.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [6.73367977142334, 6.533031940460205], "id": 3503, "title": "Baked-in State Probing", "authors": "Shubham Toshniwal, Sam Wiseman, Karen Livescu and Kevin Gimpel", "abstract": "Neural language models have been analyzed for their linguistic and extra-linguistic knowledge via probing. Of particular interest has been the following question: how much can a language model trained only on form learn about meaning? Recent work has demonstrated via probing classifiers that in the setting of simple procedural text, where by \"meaning\" we mean the underlying world state, language models have a non-trivial performance on world state tracking. However, our proposed evaluation based on model predictions shows differing results, suggesting that these models are either not capturing the world state or not using it. How do these results change if the model has access to the world state? We explore this alternate setting with access to the underlying world state only during training and investigate ways of \"baking in\u201d the state knowledge along with the primary task of language modeling. Our proposed approaches allow for state probing during inference simply via text prompts, avoiding any probing classifier machinery. In terms of performance, we show that baking in the state knowledge during training leads to significant improvements in state tracking performance and text generation quality,", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.141451358795166, 12.137025833129883], "id": 3511, "title": "Find Someone Who: Visual Commonsense Understanding in Human-Centric Grounding", "authors": "Haoxuan You, Rui Sun, Zhecan Wang, Kai-Wei Chang and Shih-Fu Chang", "abstract": "From a visual scene containing multiple people, human is able to distinguish each individual given the context descriptions about what happened before, their mental/physical states or intentions, etc. Above ability heavily relies on human-centric commonsense knowledge and reasoning. For example, if asked to identify the \"person who needs healing\" in an image, we need to first know that they usually have injuries or suffering expressions, then find the corresponding visual clues before finally grounding the person. \nWe present a new commonsense task, Human-centric Commonsense Grounding, that tests the models' ability to ground individuals given the context descriptions about what happened before, and their mental/physical states or intentions. We further create a benchmark, HumanCog, a dataset with 130k grounded commonsensical descriptions annotated on 67k images, covering diverse types of commonsense and visual scenes. We set up a context-object-aware method as a strong baseline that outperforms previous pre-trained and non-pretrained models. Further analysis demonstrates that rich visual commonsense and powerful integration of multi-modal commonsense are essential, which sheds light on future works. Data and code will be available at https://github.com/Hxyou/HumanCog.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [3.871528387069702, 9.226153373718262], "id": 3514, "title": "CrisisLTLSum: A Benchmark for Local Crisis Event Timeline Extraction and Summarization", "authors": "Hossein Rajaby Faghihi, Bashar Alhafni, Ke Zhang, Shihao Ran, Joel Tetreault and Alejandro Jaimes", "abstract": "Social media has increasingly played a key role in emergency response: first responders can use public posts to better react to ongoing crisis events and deploy the necessary resources where they are most needed. Timeline extraction and abstractive summarization are critical technical tasks to leverage large numbers of social media posts about events. Unfortunately, there are few datasets for benchmarking technical approaches for those tasks. This paper presents \\datasetname{}, the largest dataset of local crisis event timelines available to date. \\datasetname{} contains 1,000 crisis event timelines across four domains: wildfires, local fires, traffic, and storms. We built \\datasetname{} using a semi-automated cluster-then-refine approach to collect data from the public Twitter stream. \nOur initial experiments indicate a significant gap between the performance of strong baselines compared to the human performance on both tasks.\nOur dataset, code, and models are publicly available (https://github.com/CrisisLTLSum/CrisisTimelines).", "track": "Resources and Evaluation", "label": 1}, {"loc": [9.164258003234863, 6.983469009399414], "id": 3528, "title": "Prompt-Tuning Can Be Much Better Than Fine-Tuning on Cross-lingual Understanding With Multilingual Language Models", "authors": "Lifu Tu, Caiming Xiong and Yingbo Zhou", "abstract": "Pre-trained multilingual language models show significant performance gains for zero-shot cross-lingual model transfer on a wide range of natural language understanding (NLU) tasks. Previously, for zero-shot cross-lingual evaluation, pre-trained models are only fine-tuned on English data and tested on a variety of target languages. In this paper, we do cross-lingual\nevaluation on various NLU tasks (sentence classification, sequence labeling, question answering) using prompt-tuning and compare it with fine-tuning. The results show that prompt tuning achieves much better cross-lingual transfer than fine-tuning across datasets, with only 0.1% to 0.3% tuned parameters. Additionally, we demonstrate through the analysis that prompt tuning can have better cross-lingual transfer-ability of representations on downstream tasks with better aligned decision boundaries.", "track": "Multilinguality", "label": 13}, {"loc": [9.959933280944824, 7.850012302398682], "id": 3530, "title": "BERT Meets CTC: New Formulation of End-to-End Speech Recognition with Pre-trained Masked Language Model", "authors": "Yosuke Higuchi, Brian Yan, Siddhant Arora, Tetsuji Ogawa, Tetsunori Kobayashi and Shinji Watanabe", "abstract": "This paper presents BERT-CTC, a novel formulation of end-to-end speech recognition that adapts BERT for connectionist temporal classification (CTC). Our formulation relaxes the conditional independence assumptions used in conventional CTC and incorporates linguistic knowledge through the explicit output dependency obtained by BERT contextual embedding. BERT-CTC attends to the full contexts of the input and hypothesized output sequences via the self-attention mechanism. This mechanism encourages a model to learn inner/inter-dependencies between the audio and token representations while maintaining CTC's training efficiency. During inference, BERT-CTC combines a mask-predict algorithm with CTC decoding, which iteratively refines an output sequence. The experimental results reveal that BERT-CTC improves over conventional approaches across variations in speaking styles and languages. Finally, we show that the semantic representations in BERT-CTC are beneficial towards downstream spoken language understanding tasks.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.668966770172119, 9.15731143951416], "id": 3533, "title": "EtriCA: Event-Triggered Context-Aware Story Generation Augmented by Cross Attention", "authors": "Chen Tang, Chenghua Lin, Henglin Huang, Frank Guerin and Zhihao Zhang", "abstract": "One of the key challenges of automatic story generation is how to generate a long narrative that can maintain fluency, relevance, and coherence. Despite recent progress, current story generation systems still face the challenge of how to effectively capture contextual and event features, which has a profound impact on a model's generation performance. To address these challenges, we present EtriCA, a novel neural generation model, which improves the relevance and coherence of the generated stories through residually mapping context features to event sequences with a cross-attention mechanism. Such a feature capturing mechanism allows our model to better exploit the logical relatedness between events when generating stories. Extensive experiments based on both automatic and human evaluations show that our model significantly outperforms state-of-the-art baselines, demonstrating the effectiveness of our model in leveraging context and event features.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.9025468826293945, 9.186478614807129], "id": 3534, "title": "LADIS: Language Disentanglement for 3D Shape Editing", "authors": "Ian Huang, Panos Achlioptas, Tianyi Zhang, Sergei Tulyakov, Minhyuk Sung and Leonidas Guibas", "abstract": "Natural language interaction is a promising direction for democratizing 3D shape design. However, existing methods for text-driven 3D shape editing face challenges in producing decoupled, local edits to 3D shapes. We address this problem by learning disentangled latent representations that ground language in 3D geometry. To this end, we propose a complementary tool set including a novel network architecture, a disentanglement loss, and a new editing procedure. Additionally, to measure edit locality, we define a new metric that we call part-wise edit precision. We show that our method outperforms existing SOTA methods by 20% in terms of edit locality, and up to 6.6% in terms of language reference resolution accuracy. Human evaluations additionally show that compared to the existing SOTA, our method produces shape edits that are more local, more semantically accurate, and more visually obvious. Our work suggests that by solely disentangling language representations, downstream 3D shape editing can become more local to relevant parts, even if the model was never given explicit part-based supervision.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [8.532156944274902, 7.888370513916016], "id": 3544, "title": "Effective Pretraining Objectives for Transformer-based Autoencoders", "authors": "Luca Di Liello, Matteo Gabburo and Alessandro Moschitti", "abstract": "In this paper, we study trade-offs between efficiency, cost and accuracy when pre-training Transformer encoders with different pre-training objectives. For this purpose, we analyze features of common objectives and combine them to create new effective pre-training approaches. Specifically, we designed light token generators based on a straightforward statistical approach, which can replace ELECTRA computationally heavy generators, thus highly reducing cost. Our experiments also show that (i) there are more efficient alternatives to BERT's MLM, and (ii) it is possible to efficiently pre-train Transformer-based models using lighter generators without a significant drop in performance.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.861241340637207, 7.375240802764893], "id": 3578, "title": "Language Model Detoxification in Dialogue with Contextualized Stance Control", "authors": "Jing Qian and Xifeng Yan", "abstract": "To reduce the toxic degeneration in a pretrained Language Model (LM), previous work on Language Model detoxification has focused on reducing the toxicity of the generation itself (self-toxicity) without consideration of the context. As a result, a type of implicit offensive language where the generations support the offensive language in the context is ignored. Different from the LM controlling tasks in previous work, where the desired attributes are fixed for generation, the desired stance of the generation depends on the offensiveness of the context. Therefore, we propose a novel control method to do context-dependent detoxification with the stance taken into consideration. We introduce meta prefixes to learn the contextualized stance control strategy and to generate the stance control prefix according to the input context. The generated stance prefix is then combined with the toxicity control prefix to guide the response generation. Experimental results show that our proposed method can effectively learn the context-dependent stance control strategies while keeping a low self-toxicity of the underlying LM.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [2.0995357036590576, 7.5966339111328125], "id": 3582, "title": "Multilingual SubEvent Relation Extraction: A Novel Dataset and Structure Induction Method", "authors": "Viet Dac Lai, Hieu Man, Linh Ngo, Franck Dernoncourt and Thien Huu Nguyen", "abstract": "Subevent Relation Extraction (SRE) is a task in Information Extraction that aims to recognize spatial and temporal containment relations between event mentions in text. Recent methods have utilized pre-trained language models to represent input texts for SRE. However, a key issue in existing SRE methods is the employment of sequential order of words in texts to feed into representation learning methods, thus unable to explicitly focus on important context words and their interactions to enhance representations. In this work, we introduce a new method for SRE that learns to induce effective graph structures for input texts to boost representation learning. Our method features a word alignment framework with dependency paths and optimal transport to identify important context words to form effective graph structures for SRE. In addition, to enable SRE research on non-English languages, we present a new multilingual SRE dataset for five typologically different languages. Extensive experiments reveal the state-of-the-art performance for our method on different datasets and languages.", "track": "Information Extraction", "label": 5}, {"loc": [1.6975172758102417, 5.370996475219727], "id": 3585, "title": "A Two-Stage Approach towards Generalization in Knowledge Base Question Answering", "authors": "Srinivas Ravishankar, Dung Thai, Ibrahim Abdelaziz, Nandana Mihindukulasooriya, Tahira Naseem, Pavan Kapanipathi, Gaetano Rossiello and Achille Fokoue", "abstract": "Most existing approaches for Knowledge Base Question Answering (KBQA) focus on a specific underlying knowledge base either because of inherent assumptions in the approach, or because evaluating it on a different knowledge base requires non-trivial changes. However, many popular knowledge bases share similarities in their underlying schemas that can be leveraged to facilitate generalization across knowledge bases. To achieve this generalization, we introduce a KBQA framework based on a 2-stage architecture that explicitly separates semantic parsing from the knowledge base interaction, facilitating transfer learning across datasets and knowledge graphs. We show that pretraining on datasets with a different underlying knowledge base can nevertheless provide significant performance gains and reduce sample complexity. Our approach achieves comparable or state-of-the-art performance for LC-QuAD (DBpedia), WebQSP (Freebase), SimpleQuestions (Wikidata) and MetaQA (Wikimovies-KG).", "track": "Question Answering", "label": 11}, {"loc": [8.001968383789062, 9.687429428100586], "id": 3612, "title": "Few-Shot (Dis)Agreement Identification in Online Discussions with Regularized and Augmented Meta-Learning", "authors": "Yuanyuan Lei and Ruihong Huang", "abstract": "Online discussions are abundant with opinions towards a common topic, and identifying (dis)agreement between a pair of comments enables many opinion mining applications. Realizing the increasing needs to analyze opinions for emergent new topics that however tend to lack annotations, we present the first meta-learning approach for few-shot (dis)agreement identification that can be quickly applied to analyze opinions for new topics with few labeled instances. Furthermore, we enhance the meta-learner's domain generalization ability from two perspectives. The first is domain-invariant regularization, where we design a lexicon-based regularization loss to enable the meta-learner to learn domain-invariant cues. The second is domain-aware augmentation, where we propose domain-aware task augmentation for meta-training to learn domain-specific expressions. In addition to using an existing dataset, we also evaluate our approach on two very recent new topics, mask mandate and COVID vaccine, using our newly annotated datasets containing 1.5k and 1.4k SubReddits comment pairs respectively. Extensive experiments on three domains/topics demonstrate the effectiveness of our meta-learning approach.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [10.710810661315918, 6.927748203277588], "id": 3615, "title": "Data Cartography for Low-Resource Neural Machine Translation", "authors": "Aquia Richburg and Marine Carpuat", "abstract": "While collecting or generating more parallel data is necessary to improve machine translation (MT) in low-resource settings, we lack an understanding of how the limited amounts of existing data are actually used to help guide the collection of further resources. In this paper, we apply data cartography techniques (Swayamdipta et al., 2020) to characterize the contribution of training samples in two low-resource MT tasks (Swahili-English and Turkish-English) throughout the training of standard neural MT models. Our empirical study shows that, unlike in prior work for classification tasks, most samples contribute to model training in low-resource MT, albeit not uniformly throughout the training process. \n Furthermore, uni-dimensional characterizations of samples -- e.g., based on dual cross-entropy or word frequency -- do not suffice to characterize to what degree they are hard or easy to learn. Taken together, our results suggest that data augmentation strategies for low-resource MT would benefit from model-in-the-loop strategies to maximize improvements.", "track": "Machine Translation", "label": 10}, {"loc": [1.3235055208206177, 4.910745143890381], "id": 3645, "title": "Augmenting Multi-Turn Text-to-SQL Datasets with Self-Play", "authors": "Qi Liu, Zihuiwen Ye, Tao Yu, Linfeng Song and Phil Blunsom", "abstract": "The task of context-dependent text-to-SQL aims to convert multi-turn user utterances to formal SQL queries. This is a challenging task due to both the scarcity of training data from which to learn complex contextual dependencies and to generalize to unseen databases. In this paper we explore augmenting the training datasets using self-play, which leverages contextual information to synthesize new interactions to adapt the model to new databases. We first design a SQL-to-text model conditioned on a sampled goal query, which represents a user's intent, that then converses with a text-to-SQL semantic parser to generate new interactions. We then filter the synthesized interactions and retrain the models with the augmented data. We find that self-play improves the accuracy of a strong baseline on SParC and CoSQL, two widely used cross-domain text-to-SQL datasets. Our analysis shows that self-play simulates various conversational thematic relations, enhances cross-domain generalization and improves beam-search.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [5.990258693695068, 8.834150314331055], "id": 3647, "title": "Prompt Compression and Contrastive Conditioning for Controllability and Toxicity Reduction in Language Models", "authors": "David Wingate, Mohammad Shoeybi and Taylor Sorensen", "abstract": "We explore the idea of compressing the prompts used to condition language models, and show that compressed prompts can retain a substantive amount of information about the original prompt. For severely compressed prompts, while fine-grained information is lost, abstract information and general sentiments can be retained with surprisingly few parameters, which can be useful in the context of decode-time algorithms for controllability and toxicity reduction. We find that some complex prompts can be effectively compressed into a single token to guide generation. We also show that compressed prompts are largely compositional, and can be constructed such that they can be used to control independent aspects of generated text.", "track": "Ethic Concerns:Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [7.579198837280273, 3.7355782985687256], "id": 3664, "title": "NaturalAdversaries: Can Naturalistic Adversaries Be as Effective as Artificial Adversaries?", "authors": "Saadia Gabriel, Hamid Palangi and Yejin Choi", "abstract": "While a substantial body of prior work has explored adversarial example generation for natural language understanding tasks, these examples are often unrealistic and diverge from the real-world data distributions. In this work, we introduce a two-stage adversarial example generation framework (NaturalAdversaries), for designing adversaries that are effective at fooling a given classifier and demonstrate natural-looking failure cases that could plausibly occur during in-the-wild deployment of the models. At the first stage a token attribution method is used to summarize a given classifier's behavior as a function of the key tokens in the input. In the second stage a generative model is conditioned on the key tokens from the first stage. NaturalAdversaries is adaptable to both black-box and white-box adversarial attacks based on the level of access to the model parameters. Our results indicate these adversaries generalize across domains, and offer insights for future research on improving robustness of neural text classification models.", "track": "Resources and Evaluation", "label": 1}, {"loc": [10.523860931396484, 7.183385372161865], "id": 3679, "title": "Multi-Path Transformer is Better: A Case Study on Neural Machine Translation", "authors": "Ye Lin, Shuhan Zhou, Yanyang Li, Anxiang Ma, Tong Xiao and Jingbo Zhu", "abstract": "For years the model performance in machine learning obeyed a power-law relationship with the model size. For the consideration of parameter efficiency, recent studies focus on increasing model depth rather than width to achieve better performance. In this paper, we study how model width affects the Transformer model through a parameter-efficient multi-path structure. To better fuse features extracted from different paths, we add three additional operations to each sublayer: a normalization at the end of each path, a cheap operation to produce more features, and a learnable weighted mechanism to fuse all features flexibly. Extensive experiments on 12 WMT machine translation tasks show that, with the same number of parameters, the shallower multi-path model can achieve similar or even better performance than the deeper model. It reveals that we should pay more attention to the multi-path structure, and there should be a balance between the model depth and width to train a better large-scale Transformer.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [3.853700637817383, 7.779420852661133], "id": 3683, "title": "Unsupervised Learning of Hierarchical Conversation Structure", "authors": "Bo-Ru Lu, Yushi Hu, Hao Cheng, Noah A. Smith and Mari Ostendorf", "abstract": "Human conversations can evolve in many different ways, creating challenges for automatic understanding and summarization. Goal-oriented conversations often have meaningful sub-dialogue structure, but it can be highly domain-dependent. This work introduces an unsupervised approach to learning hierarchical conversation structure, including turn and sub-dialogue segment labels, corresponding roughly to dialogue acts and sub-tasks, respectively. The decoded structure is shown to be useful in enhancing neural models of language for three conversation-level understanding tasks. Further, the learned finite-state sub-dialogue network is made interpretable through automatic summarization.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [7.6703386306762695, 8.579376220703125], "id": 3685, "title": "Task Compass: Scaling Multi-task Pre-training with Task Prefix", "authors": "Zhuosheng Zhang, Shuohang Wang, Yichong Xu, Yuwei Fang, Wenhao Yu, Yang Liu, Hai Zhao, Chenguang Zhu and Michael Zeng", "abstract": "Leveraging task-aware annotated data as supervised signals to assist with self-supervised learning on large-scale unlabeled data has become a new trend in pre-training language models. Existing studies show that multi-task learning with large-scale supervised tasks suffers from negative effects across tasks. To tackle the challenge, we propose a task prefix guided multi-task pre-training framework to explore the relationships among tasks. We conduct extensive experiments on 40 datasets, which show that our model can not only serve as the strong foundation backbone for a wide range of tasks but also be feasible as a probing tool for analyzing task relationships. The task relationships reflected by the prefixes align transfer learning performance between tasks. They also suggest directions for data augmentation with complementary tasks, which help our model achieve human-parity results on commonsense reasoning leaderboards. Code is available at https://github.com/cooelf/CompassMTL.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [7.541600704193115, 3.7566184997558594], "id": 3688, "title": "Sharpness-Aware Minimization with Dynamic Reweighting", "authors": "Wenxuan Zhou, Fangyu Liu, Huan Zhang and Muhao Chen", "abstract": "Deep neural networks are often overparameterized and may not easily achieve model generalization. Adversarial training has shown effectiveness in improving generalization by regularizing the change of loss on top of adversarially chosen perturbations. The recently proposed sharpness-aware minimization (SAM) algorithm conducts adversarial weight perturbation, encouraging the model to converge to a flat minima. SAM finds a common adversarial weight perturbation per-batch. Although per-instance adversarial weight perturbations are stronger adversaries and can potentially lead to better generalization performance, their computational cost is very high and thus it is impossible to use per-instance perturbations efficiently in SAM. In this paper, we tackle this efficiency bottleneck and propose sharpness-aware minimization with dynamic reweighting (delta-SAM). Our theoretical analysis motivates that it is possible to approach the stronger, per-instance adversarial weight perturbations using reweighted per-batch weight perturbations. delta-SAM dynamically reweights perturbation within each batch according to the theoretically principled weighting factors, serving as a good approximation to per-instance perturbation. Experiments on various natural language understanding tasks demonstrate the effectiveness of delta-SAM.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [6.449916839599609, 7.521115779876709], "id": 3693, "title": "Predicting Long-Term Citations from Short-Term Linguistic Influence", "authors": "Sandeep Soni, David Bamman and Jacob Eisenstein", "abstract": "A standard measure of the influence of a research paper is the number of times it is cited. However, papers may be cited for many reasons, and citation count is not informative about the extent to which a paper affected the content of subsequent publications. We therefore propose a novel method to quantify linguistic influence in timestamped document collections. There are two main steps: first, identify lexical and semantic changes using contextual embeddings and word frequencies; second, aggregate information about these changes into per-document influence parameters by estimating a high-dimensional Hawkes process with a low-rank parameter matrix. The resulting measures of linguistic influence are predictive of \\emph{future} citations. Specifically, the estimate of linguistic influence from the two years after a paper's publication is correlated with and predictive of its citation count in the following three years. This is demonstrated using an online evaluation with incremental temporal training/test splits, in comparison with a strong baseline that includes predictors for initial citation counts, topics, and lexical features.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [10.024506568908691, 7.926711082458496], "id": 3697, "title": "Joint Audio/Text Training for Transformer Rescorer of Streaming Speech Recognition", "authors": "Suyoun Kim, Ke Li, Lucas Kabela, Ron Huang, Jiedan Zhu, Ozlem Kalinli and Duc Le", "abstract": "Recently, there has been an increasing interest in two-pass streaming end-to-end speech recognition (ASR) that incorporates a 2nd-pass rescoring model on top of the conventional 1st-pass streaming ASR model to improve recognition accuracy while keeping latency low. One of the latest 2nd-pass rescoring model, Transformer Rescorer, takes the n-best initial outputs and audio embeddings from the 1st-pass model, and then choose the best output by re-scoring the n-best initial outputs. However, training this Transformer Rescorer requires expensive paired audio-text training data because the model uses audio embeddings as input. In this work, we present our Joint Audio/Text training method for Transformer Rescorer, to leverage unpaired text-only data which is relatively cheaper than paired audio-text data. We evaluate Transformer Rescorer with our Joint Audio/Text training on Librispeech dataset as well as our large-scale in-house dataset and show that our training method can improve word error rate (WER) significantly compared to standard Transformer Rescorer without requiring any extra model parameters or latency.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.501277446746826, 7.108351230621338], "id": 3700, "title": "TyDiP: A Dataset for Politeness Classification in Nine Typologically Diverse Languages", "authors": "Anirudh Srinivasan and Eunsol Choi", "abstract": "We study politeness phenomena in nine typologically diverse languages. Politeness is an important facet of communication and is sometimes argued to be cultural-specific, yet existing computational linguistic study is limited to English. We create TyDiP, a dataset containing three-way politeness annotations for 500 examples in each language, totaling 4.5K examples. We evaluate how well multilingual models can identify politeness levels -- they show a fairly robust zero-shot transfer ability, yet fall short of estimated human accuracy significantly. We further study mapping the English politeness strategy lexicon into nine languages via automatic translation and lexicon induction, analyzing whether each strategy's impact stays consistent across languages. Lastly, we empirically study the complicated relationship between formality and politeness through transfer experiments. We hope our dataset will support various research questions and applications, from evaluating multilingual models to constructing polite multilingual agents.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [5.493197441101074, 12.417745590209961], "id": 3714, "title": "Probing Cross-modal Semantics Alignment Capability from the Textual Perspective", "authors": "Zheng Ma, Shi Zong, Mianzhi Pan, Jianbing Zhang, Shujian Huang, Xinyu Dai and Jiajun CHEN", "abstract": "In recent years, vision and language pre-training (VLP) models have advanced the state-of-the-art results in a variety of cross-modal downstream tasks. Aligning cross-modal semantics is claimed to be one of the essential capabilities of VLP models. However, it still remains unclear about the inner working mechanism of alignment in VLP models. In this paper, we propose a new probing method that is based on image captioning to first empirically study the cross-modal semantics alignment of VLP models. Our probing method is built upon the fact that given an image-caption pair, the VLP models will give a score, indicating how well two modalities are aligned; maximizing such scores will generate sentences that VLP models believe are of good alignment. Analyzing these sentences thus will reveal in what way different modalities are aligned and how well these alignments are in VLP models. We apply our probing method to five popular VLP models, including UNITER, ROSITA, ViLBERT, CLIP, and LXMERT, and provide a comprehensive analysis of the generated captions guided by these models. Our results show that VLP models (1) focus more on just aligning objects with visual words, while neglecting global semantics; (2) prefer fixed sentence patterns, thus ignoring more important textual information including fluency and grammar; and (3) deem the captions with more visual words are better aligned with images. These findings indicate that VLP models still have weaknesses in cross-modal semantics alignment and we hope this work will draw researchers' attention to such problems when designing a new VLP model.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [8.691482543945312, 8.38038444519043], "id": 3715, "title": "Hidden State Variability of Pretrained Language Models Can Guide Computation Reduction for Transfer Learning", "authors": "Shuo Xie, Jiahao Qiu, Ankita Pasad, Li Du, Qing Qu and Hongyuan Mei", "abstract": "While transferring a pretrained language model, common approaches conventionally attach their task-specific classifiers to the top layer and adapt all the pretrained layers. We investigate whether one could make a task-specific selection on which subset of the layers to adapt and where to place the classifier. The goal is to reduce the computation cost of transfer learning methods (e.g. fine-tuning or adapter-tuning) without sacrificing its performance.\n\nWe propose to select layers based on the variability of their hidden states given a task-specific corpus. \nWe say a layer is already ``well-specialized'' in a task if the within-class variability of its hidden states is low relative to the between-class variability. Our variability metric is cheap to compute and doesn't need any training or hyperparameter tuning. It is robust to data imbalance and data scarcity. Extensive experiments on the GLUE benchmark demonstrate that selecting layers based on our metric can yield significantly stronger performance than using the same number of top layers and often match the performance of fine-tuning or adapter-tuning the entire language model.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [6.602209091186523, 6.4114837646484375], "id": 3728, "title": "Language Models as Agent Models", "authors": "Jacob Andreas", "abstract": "Language models (LMs) are trained on collections of documents, written by individual human agents to achieve specific goals in the outside world. During training, LMs have access only to text of these documents, with no direct evidence of the internal states of the agents that produced them---a fact often used to argue that LMs are incapable of modeling goal-directed aspects of human language production and comprehension. Can LMs trained on text learn anything at all about the relationship between language and use? I argue that LMs are models of communicative intentions in a specific, narrow sense. When performing next word prediction given a textual context, an LM can infer and represent properties of an agent likely to have produced that context. These representations can in turn influence subsequent LM generation in the same way that agents' communicative intentions influence their language. I survey findings from the recent literature showing that---even in today's non-robust and error-prone models---LMs infer and use representations of fine-grained communicative intentions and high-level beliefs and goals. Despite the limited nature of their training data, they can thus serve as building blocks for systems that communicate and act intentionally.", "track": "Theme Track", "label": 18}, {"loc": [0.9188224673271179, 8.126646995544434], "id": 3738, "title": "Combinatory Grammar Tells Underlying Relevance among Entities", "authors": "Yuanhe Tian and Yan Song", "abstract": "Relation extraction (RE) is an important task in natural language processing which aims to annotate the relation between two given entities, which requires a deep understanding of the running text. To import model performance, existing approaches leverage syntactic information to facilitate the relation extraction process, where they mainly focus on dependencies among words while paying limited attention to other types of syntactic structure. Considering that combinatory categorial grammar (CCG) is a lexicalized grammatical formalism that carries the syntactic and semantic knowledge for text understanding, we propose an alternative solution for RE that takes advantage of CCG to detect the relation between entities. In doing so, we perform a multi-task learning process to learn from RE and auto-annotated CCG supertags, where an attention mechanism is performed over all input words to distinguish the important ones for RE with the attention weights guided by the supertag decoding process. We evaluate our model on two widely used English benchmark datasets (i.e., ACE2005EN and SemEval 2010 Task 8 datasets) for RE, where the effectiveness of our approach is demonstrated by the experimental results with our approach achieving state-of-the-art performance on both datasets.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [4.854565143585205, 6.876981735229492], "id": 3748, "title": "Leveraging Open Data and Task Augmentation to Automated Behavioral Coding of Psychotherapy Conversations in Low-Resource Scenarios", "authors": "Zhuohao Chen, Nikolaos Flemotomos, Zac Imel, David Atkins and Shrikanth Narayanan", "abstract": "In psychotherapy interactions, the quality of a session is assessed by codifying the communicative behaviors of participants during the conversation through manual observation and annotation. Developing computational approaches for automated behavioral coding can reduce the burden on human coders and facilitate the objective evaluation of the intervention. In the real world, however, implementing such algorithms is associated with data sparsity challenges since privacy concerns lead to limited available in-domain data. In this paper, we leverage a publicly available conversation-based dataset and transfer knowledge to the low-resource behavioral coding task by performing an intermediate language model training via meta-learning. We introduce a task augmentation method to produce a large number of ``analogy tasks'' \u2014 tasks similar to the target one \u2014 and demonstrate that the proposed framework predicts target behaviors more accurately than all the other baseline models.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [7.2511725425720215, 9.505478858947754], "id": 3749, "title": "Learning to Detect Noisy Labels Using Model-Based Features", "authors": "Zhihao Wang, Zongyu Lin, Junjie Wen, Xianxin Chen, Peiqi Liu, Guidong Zheng, Yujun Chen and Zhilin Yang", "abstract": "Label noise is ubiquitous in various machine learning scenarios such as self-labeling with model predictions and erroneous data annotation. Many existing approaches are based on heuristics such as sample losses, which might not be flexible enough to achieve optimal solutions. Meta learning based methods address this issue by learning a data selection function, but can be hard to optimize. In light of these pros and cons, we propose SENT (Selection-Enhanced Noisy label Training) that does not rely on meta learning while having the flexibility of being data-driven. SENT transfers the noise distribution to a clean set and trains a model to distinguish noisy labels from clean ones using model-based features. Empirically, on a wide range of tasks including text classification and speech recognition, SENT improves performance over strong baselines under the settings of self-training and label corruption.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [5.100921154022217, 9.623579978942871], "id": 3753, "title": "Keyphrase Generation Beyond the Boundaries of Title and Abstract", "authors": "Krishna Garg, Jishnu Ray Chowdhury and Cornelia Caragea", "abstract": "Keyphrase generation aims at generating important phrases (keyphrases) that best describe a given document. In scholarly domains, current approaches have largely used only the title and abstract of the articles to generate keyphrases. In this paper, we comprehensively explore whether the integration of additional information from the full text of a given article or from semantically similar articles can be helpful for a neural keyphrase generation model or not. We discover that adding sentences from the full text, particularly in the form of the extractive summary of the article can significantly improve the generation of both types of keyphrases that are either present or absent from the text. Experimental results with three widely used models for keyphrase generation along with one of the latest transformer models suitable for longer documents, Longformer Encoder-Decoder (LED) validate the observation. We also present a new large-scale scholarly dataset FullTextKP for keyphrase generation. Unlike prior large-scale datasets, FullTextKP includes the full text of the articles along with the title and abstract. We release the source code at https://github.com/kgarg8/FullTextKP.", "track": "Information Extraction", "label": 5}, {"loc": [7.322485446929932, 7.006447792053223], "id": 3767, "title": "Composition, Attention, or Both?", "authors": "Ryo Yoshida and Yohei Oseki", "abstract": "In this paper, we propose a novel architecture called Composition Attention Grammars (CAGs) that recursively compose subtrees into a single vector representation with a composition function, and selectively attend to previous structural information with a self-attention mechanism. We investigate whether these components---the composition function and the self-attention mechanism---can both induce human-like syntactic generalization. Specifically, we train language models (LMs) with and without these two components with the model sizes carefully controlled, and evaluate their syntactic generalization performance against six test circuits on the SyntaxGym benchmark. The results demonstrated that the composition function and the self-attention mechanism both play an important role to make LMs more human-like, and closer inspection of linguistic phenomenon implied that the composition function allowed syntactic features, but not semantic features, to percolate into subtree representations.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [6.109938621520996, 8.323406219482422], "id": 3777, "title": "CDGP: Automatic Cloze Distractor Generation based on Pre-trained Language Model", "authors": "Shang-Hsuan Chiang, Ssu-Cheng Wang and Yao-Chung Fan", "abstract": "Manually designing cloze test consumes enormous time and efforts. The major challenge lies in wrong option (distractor) selection. Having carefully-design distractors improves the effectiveness of learner ability assessment. As a result, the idea of automatically generating cloze distractor is motivated. In this paper, we investigate cloze distractor generation by exploring the employment of pre-trained language models (PLMs) as an alternative for candidate distractor generation. Experiments show that the PLM-enhanced model brings a substantial performance improvement. Our best performing model advances the state-of-the-art result from 14.94 to 34.17 (NDCG@10 score). Our code and dataset is available at \nhttps://github.com/AndyChiangSH/CDGP.", "track": "NLP Applications", "label": 0}, {"loc": [5.090182304382324, 12.533550262451172], "id": 3778, "title": "G3: Geolocation via Guidebook Grounding", "authors": "Grace Luo, Giscard Biamby, Trevor Darrell, Daniel Fried and Anna Rohrbach", "abstract": "We demonstrate how language can improve geolocation: the task of predicting the location where an image was taken. Here we study explicit knowledge from human-written guidebooks that describe the salient and class-discriminative visual features humans use for geolocation. We propose the task of Geolocation via Guidebook Grounding that uses a dataset of StreetView images from a diverse set of locations and an associated textual guidebook for GeoGuessr, a popular interactive geolocation game. Our approach predicts a country for each image by attending over the clues automatically extracted from the guidebook. Supervising attention with country-level pseudo labels achieves the best performance. Our approach substantially outperforms a state-of-the-art image-only geolocation method, with an improvement of over 5% in Top-1 accuracy. Our dataset and code can be found at https://github.com/g-luo/geolocation_via_guidebook_grounding.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [6.026961803436279, 5.405941486358643], "id": 3785, "title": "Controlling Bias Exposure for Fair Interpretable Predictions", "authors": "Zexue He, Yu Wang, Julian McAuley and Bodhisattwa Prasad Majumder", "abstract": "Recent work on reducing bias in NLP models usually focuses on protecting or isolating information related to a sensitive attribute (like gender or race). However, when sensitive information is semantically entangled with the task information of the input, e.g., gender information is predictive for a profession, a fair trade-off between task performance and bias mitigation is difficult to achieve. Existing approaches perform this trade-off by eliminating bias information from the latent space, lacking control over how much bias is necessarily required to be removed. We argue that a favorable debiasing method should use sensitive information \u2018fairly', rather than blindly eliminating it (Caliskan et al., 2017; Sun et al., 2019; Bogen et al., 2020). In this work, we provide a novel debiasing algorithm by adjusting\nthe predictive model's belief to (1) ignore the sensitive information if it is not useful for the task; (2) use sensitive information minimally as necessary for the prediction (while also incurring a penalty). Experimental results on two text classification tasks (influenced by gender) and an open-ended generation task (influenced by race) indicate that our model achieves a desirable trade-off between debiasing and task performance along with producing debiased rationales as evidence.", "track": "Ethics", "label": 21}, {"loc": [5.0633955001831055, 4.9315948486328125], "id": 3821, "title": "Investigating the Benefits of Free-Form Rationales", "authors": "Jiao Sun, Swabha Swayamdipta, Jonathan May and Xuezhe Ma", "abstract": "Free-form rationales aim to aid model interpretability by supplying the background knowledge that can help understand model decisions. Crowdsourced rationales are provided for commonsense QA instances in popular datasets such as CoS-E and ECQA, but their utility remains under-investigated. We present human studies which show that ECQA rationales indeed provide additional background information to understand a decision, while over 88% of CoS-E rationales do not. Inspired by this finding, we ask: can the additional context provided by free-form rationales benefit models, similar to human users? We investigate the utility of rationales as an additional source of supervision, by varying the quantity and quality of rationales during training. After controlling for instances where rationales leak the correct answer while not providing additional background knowledge, we find that incorporating only 5% of rationales during training can boost model performance by 47.22% for CoS-E and 57.14% for ECQA during inference. Moreover, we also show that rationale quality matters: compared to crowdsourced rationales, T5-generated rationales provide not only weaker supervision to models, but are also not helpful for humans in aiding model interpretability.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [4.7183098793029785, 3.430185556411743], "id": 3853, "title": "Data-Efficient Concept Extraction from Pre-trained Language Models for Commonsense Explanation Generation", "authors": "Yanbo Fang and Yongfeng Zhang", "abstract": "Predicting the key explanation concept is essential for generating commonsense explanations. This paper introduces a method to predict the concept from pre-trained language models for commonsense explanation generation. Our experiment found that adopting a language model as the concept extractor and fine-tuning it with 20% training data can improve the quality and accuracy of the generated explanations over multiple evaluation metrics. Compared with conventional methods that search concepts over knowledge graphs, our method does not require the preparation and training models to search through knowledge graphs. To better understand the results from pre-trained language models, we also designed a metric to evaluate the retrieved concepts. Through analysis and experiments, we show the correlation between this metric and the performance of the generators, and we also show the importance of attaching concepts for generating high-quality sentences.", "track": "Natural Language Generation", "label": 6}, {"loc": [1.6340166330337524, 8.558052062988281], "id": 3862, "title": "Unsupervised Domain Adaptation for Joint Information Extraction", "authors": "Nghia Ngo Trung, Bonan Min and Thien Huu Nguyen", "abstract": "Joint Information Extraction (JIE) aims to jointly solve multiple tasks in the Information Extraction pipeline (e.g., entity mention, event trigger, relation, and event argument extraction). Due to their ability to leverage task dependencies and avoid error propagation, JIE models have presented state-of-the-art performance for different IE tasks. However, an issue with current JIE methods is that they only focus on standard supervised learning setting where training and test data comes from the same domain. Cross-domain/domain adaptation learning with training and test data in different domains have not been explored for JIE, thus hindering the application of this technology to different domains in practice. To address this issue, our work introduces the first study to evaluate performance of JIE models in unsupervised domain adaptation setting. In addition, we present a novel method to induce domain-invariant representations for the tasks in JIE, called Domain Adaptation for Joint Information Extraction (DA4JIE). In DA4JIE, we propose an Instance-relational Domain Adaptation mechanism that seeks to align representations of task instances in JIE across domains through a generalized version of domain-adversarial learning approach. We further devise a Context-invariant Structure Learning technique to filter domain-specialized contextual information from induced representations to boost performance of JIE models in new domains. Extensive experiments and analyses demonstrate that DA4JIE can significantly improve out-of-domain performance for current state-of-the-art JIE systems for all IE tasks.", "track": "Information Extraction", "label": 5}, {"loc": [7.909524917602539, 3.3119137287139893], "id": 3866, "title": "Foiling Training-Time Attacks on Neural Machine Translation Systems", "authors": "Jun Wang, Xuanli He, Benjamin Rubinstein and Trevor Cohn", "abstract": "Neural machine translation (NMT) systems are vulnerable to backdoor attacks, whereby an attacker injects poisoned samples into training such that a trained model produces malicious translations. Nevertheless, there is little research on defending against such backdoor attacks in NMT. In this paper, we first show that backdoor attacks that have been successful in text classification are also effective against machine translation tasks. We then present a novel defence method that exploits a key property of most backdoor attacks: namely the asymmetry between the source and target language sentences, which is used to facilitate malicious text insertions, substitutions and suchlike. Our technique uses word alignment coupled with language model scoring to detect outlier tokens, and thus can find and filter out training instances which may contain backdoors. Experimental results demonstrate that our technique can significantly reduce the success of various attacks by up to 89.0%, while not affecting predictive accuracy.", "track": "Machine Translation", "label": 10}, {"loc": [5.605201721191406, 11.145689964294434], "id": 3869, "title": "Learning Action-Effect Dynamics for Hypothetical Vision-Language Reasoning Task", "authors": "Shailaja Keyur Sampat, Pratyay Banerjee, Yezhou Yang and Chitta Baral", "abstract": "'Actions' play a vital role in how humans interact with the world. Thus, autonomous agents that would assist us in everyday tasks also require the capability to perform 'Reasoning about Actions & Change' (RAC). This has been an important research direction in Artificial Intelligence (AI) in general, but the study of RAC with visual and linguistic inputs is relatively recent. The CLEVR_HYP (Sampat et. al., 2021) is one such testbed for hypothetical vision-language reasoning with actions as the key focus. In this work, we propose a novel learning strategy that can improve reasoning about the effects of actions. We implement an encoder-decoder architecture to learn the representation of actions as vectors. We combine the aforementioned encoder-decoder architecture with existing modality parsers and a scene graph question answering model to evaluate our proposed system on the CLEVR_HYP dataset. We conduct thorough experiments to demonstrate the effectiveness of our proposed approach and discuss its advantages over previous baselines in terms of performance, data efficiency, and generalization capability.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.538020610809326, 12.720898628234863], "id": 3897, "title": "Named Entity and Relation Extraction with Multi-Modal Retrieval", "authors": "Xinyu Wang, Jiong Cai, Yong Jiang, Pengjun Xie, Kewei Tu and Wei Lu", "abstract": "Multi-modal named entity recognition (NER) and relation extraction (RE) aim to leverage relevant image information to improve the performance of NER and RE. \nMost existing efforts largely focused on directly extracting potentially useful information from images (such as pixel-level features, identified objects, and associated captions).\nHowever, such extraction processes may not be knowledge aware, resulting in information that may not be highly relevant.\nIn this paper, we propose a novel Multi-modal Retrieval based framework (MoRe).\nMoRe contains a text retrieval module and an image-based retrieval module, which retrieve related knowledge of the input text and image in the knowledge corpus respectively.\nNext, the retrieval results are sent to the textual and visual models respectively for predictions.\nFinally, a Mixture of Experts (MoE) module combines the predictions from the two models to make the final decision.\nOur experiments show that both our textual model and visual model can achieve state-of-the-art performance on four multi-modal NER datasets and one multi-modal RE dataset.\nWith MoE, the model performance can be further improved and our analysis demonstrates the benefits of integrating both textual and visual cues for such tasks.", "track": "Information Extraction", "label": 5}, {"loc": [5.008468151092529, 3.7165839672088623], "id": 3910, "title": "Calibrating Factual Knowledge in Pretrained Language Models", "authors": "Qingxiu Dong, Damai Dai, Yifan Song, Jingjing Xu, Zhifang Sui and Lei Li", "abstract": "Previous literature has proved that Pretrained Language Models~(PLMs) can store factual knowledge. \nHowever, we find that facts stored in the PLMs are not always correct. It motivates us to explore a fundamental question: How do we calibrate factual knowledge in PLMs without re-training from scratch? In this work, we propose a simple and lightweight method CaliNet to achieve this goal. To be specific, we first detect whether PLMs can learn the right facts via a contrastive score between right and fake facts. If not, we then use a lightweight method to add and adapt new parameters to specific factual texts. \nExperiments on the knowledge probing task show the calibration effectiveness and efficiency. \nIn addition, through closed-book question answering, we find that the calibrated PLM possesses knowledge generalization ability after finetuning.\nBeyond the calibration performance, we further investigate and visualize the knowledge calibration mechanism.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [8.129040718078613, 5.210988998413086], "id": 3915, "title": "MCPG: A Flexible Multi-Level Controllable Framework for Unsupervised Paraphrase Generation", "authors": "Yi Chen, Haiyun Jiang, Lemao Liu, Rui Wang, Shuming Shi and Ruifeng Xu", "abstract": "We present MCPG: a simple and effective\napproach for controllable unsupervised paraphrase generation, which is also flexible to\nadapt to specific domains without extra training. MCPG is controllable in different levels: local lexicons, global semantics, and universal styles. The unsupervised paradigm of\nMCPG combines factual keywords and diversified semantic embeddings as local lexical and\nglobal semantic constraints. The semantic embeddings are diversified by standard dropout,\nwhich is exploited for the first time to increase\ninference diversity by us. Moreover, MCPG\nis qualified with good domain adaptability by\nadding a transfer vector as a universal style constraint, which is refined from the exemplars retrieved from the corpus of the target domain in a\ntraining-free way. Extensive experiments show\nthat MCPG outperforms state-of-the-art unsupervised baselines by a margin. Meanwhile,\nour domain-adapted MCPG also achieves competitive performance with strong supervised\nbaselines even without training.", "track": "Natural Language Generation", "label": 6}, {"loc": [6.899649620056152, 5.91014289855957], "id": 3931, "title": "WordTies: Measuring Word Associations in Language Models via Constrained Sampling", "authors": "Peiran Yao, Tobias Renwick and Denilson Barbosa", "abstract": "Word associations are widely used in psychology to provide insights on how humans perceive and understand concepts. \nComparing word associations in language models (LMs) to those generated by human subjects can serve as a proxy to uncover embedded lexical and commonsense knowledge in language models. \nWhile much helpful work has been done applying direct metrics, such as cosine similarity, to help understand latent spaces, these metrics are symmetric, while human word associativity is asymmetric. \nWe propose WordTies, an algorithm based on constrained sampling from LMs, which allows an asymmetric measurement of associated words, given a cue word as the input. \nComparing to existing methods, word associations found by this method share more overlap with associations provided by humans, and observe the asymmetric property of human associations. \nTo examine possible reasons behind associations, we analyze the knowledge and reasoning behind the word pairings as they are linked to lexical and commonsense knowledge graphs.\nWhen the knowledge about the nature of the word pairings is combined with a probability that the LM has learned that information, we have a new way to examine what information is captured in LMs.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [3.360987901687622, 4.0825605392456055], "id": 3944, "title": "Exploring The Landscape of Distributional Robustness for Question Answering Models", "authors": "Anas Awadalla, Mitchell Wortsman, Gabriel Ilharco, Sewon Min, Ian Magnusson, Hannaneh Hajishirzi and Ludwig Schmidt", "abstract": "We conduct a large empirical evaluation to investigate the landscape of distributional robustness in question answering. Our investigation spans over 350 models and 16 question answering datasets, including a diverse set of architectures, model sizes, and adaptation methods (e.g., fine-tuning, adapter tuning, in-context learning, etc.). We find that, in many cases, model variations do not affect robustness and in-distribution performance alone determines out-of-distribution performance.\nMoreover, our findings indicate that\ni) zero-shot and in-context learning methods are more robust to distribution shifts than fully fine-tuned models;\nii) few-shot prompt fine-tuned models exhibit better robustness than few-shot fine-tuned span prediction models;\niii) parameter-efficient and robustness enhancing training methods provide no significant robustness improvements.\nIn addition, we publicly release all evaluations to encourage researchers to further analyze robustness trends for question answering models.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.196673393249512, 11.815200805664062], "id": 3945, "title": "Collaborative Reasoning on Multi-Modal Semantic Graphs for Video-Grounded Dialogue Generation", "authors": "Xueliang Zhao, Yuxuan Wang, Chongyang Tao, Chenshuo Wang and Dongyan Zhao", "abstract": "We study video-grounded dialogue generation, where a response is generated based on the dialogue context and the associated video. The primary challenges of this task lie in (1) the difficulty of integrating video data into pre-trained language models (PLMs) which presents obstacles to exploiting the power of large-scale pre-training; and (2) the necessity of taking into account the complementarity of various modalities throughout the reasoning process. Although having made remarkable progress in video-grounded dialogue generation, existing methods still fall short when it comes to integrating with PLMs in a way that allows information from different modalities to complement each other. To alleviate these issues, we first propose extracting pertinent information from videos and turning it into reasoning paths that are acceptable to PLMs. Additionally, we propose a multi-agent reinforcement learning method to collaboratively perform reasoning on different modalities (i.e., video and dialogue context). Empirical experiment results on two public datasets indicate that the proposed model can significantly outperform state-of-the-art models by large margins on both automatic and human evaluations.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [9.708629608154297, 7.980014324188232], "id": 3958, "title": "Partitioned Gradient Matching-based Data Subset Selection for Compute-Efficient Robust ASR Training", "authors": "Ashish Mittal, Durga Sivasubramanian, Rishabh Iyer, Preethi Jyothi and Ganesh Ramakrishnan", "abstract": "Training state-of-the-art ASR systems such as RNN-T often has a high associated financial and environmental cost. Training with a subset of training data could mitigate this problem if the subset selected could achieve on-par performance with training with the entire dataset. Although there are many data subset selection(DSS) algorithms, direct application to the RNN-T is difficult, especially the DSS algorithms that are adaptive and use learning dynamics such as gradients, as RNN-T tend to have gradients with a significantly larger memory footprint. In this paper, we propose Partitioned Gradient Matching (PGM) a novel distributable DSS algorithm, suitable for massive datasets like those used to train RNN-T. Through extensive experiments on Librispeech 100H and Librispeech 960H, we show that PGM achieves between 3x to 6x speedup with only a very small accuracy degradation (under 1% absolute WER difference). In addition, we demonstrate similar results for PGM even in settings where the training data is corrupted with noise.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [0.510472297668457, 7.165741443634033], "id": 3960, "title": "Adaptive Graph Convolutional Network for Knowledge Graph Entity Alignment", "authors": "Renbo Zhu, Xukun Luo, Meng Ma and Ping Wang", "abstract": "Entity alignment (EA) aims to identify equivalent entities from different Knowledge Graphs (KGs), which is a fundamental task for integrating KGs. Throughout its development, Graph Convolutional Network (GCN) has become one of the mainstream methods for EA. These GCN-based methods learn the representations of entities from two KGs by message passing mechanism and then make alignments via measuring the similarity between entity embeddings. The key idea that GCN works in EA is that entities with similar neighbor structures are highly likely to be aligned. However, the noisy neighbors of entities transfer invalid information, drown out equivalent information, lead to inaccurate entity embeddings, and finally reduce the performance of EA. Based on the Sinkhorn algorithm, we design a reliability measure for potential equivalent entities and propose Adaptive Graph Convolutional Network to deal with neighbor noises in GCN. During the training, the network dynamically updates the adaptive weights of relation triples to weaken the propagation of noises. While calculating entity similarity, it comprehensively considers the self-similarity and neighborhood similarity of the entity pair to alleviate the influence of noises. Furthermore, we design a straightforward but efficient strategy to construct pseudo alignments for unsupervised EA. Extensive experiments on benchmark datasets demonstrate that our framework outperforms the state-of-the-art methods in both supervised and unsupervised settings.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [6.155917644500732, 5.769199848175049], "id": 3961, "title": "Towards Robust NLG Bias Evaluation with Syntactically-diverse Prompts", "authors": "Arshiya Aggarwal, Jiao Sun and Nanyun Peng", "abstract": "We present a robust methodology for evaluating biases in natural language generation(NLG) systems. Previous works use fixed hand-crafted prefix templates with mentions of various demographic groups to prompt models to generate continuations for bias analysis. These fixed prefix templates could themselves be specific in terms of styles or linguistic structures, which may lead to unreliable fairness conclusions that are not representative of the general trends from tone varying prompts. To study this problem, we paraphrase the prompts with different syntactic structures and use these to evaluate demographic bias in NLG systems. Our results suggest similar overall bias trends but some syntactic structures lead to contradictory conclusions compared to past works. We show that our methodology is more robust and that some syntactic structures prompt more toxic content while others could prompt less biased generation. This suggests the importance of not relying on a fixed syntactic structure and using tone-invariant prompts. Introducing syntactically-diverse prompts can achieve more robust NLG (bias) evaluation.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [3.8473222255706787, 9.227507591247559], "id": 3964, "title": "PcMSP: A Dataset for Scientific Action Graphs Extraction from Polycrystalline Materials Synthesis Procedure Text", "authors": "Xianjun Yang, Ya Zhuo, Julia Zuo, Xinlu Zhang, Stephen Wilson and Linda Petzold", "abstract": "Scientific action graphs extraction from materials synthesis procedures is important for reproducible research, machine automation, and material prediction. But the lack of annotated data has hindered progress in this field. We demonstrate an effort to annotate P}olycrystalline Materials Synthesis Procedures PcMSP from 305 open access scientific articles for the construction of synthesis action graphs. This is a new dataset for material science information extraction that simultaneously contains the synthesis sentences extracted from the experimental paragraphs, as well as the entity mentions and intra-sentence relations. A two-step human annotation and inter-annotator agreement study guarantee the high quality of the PcMSP corpus. We introduce four natural language processing tasks: sentence classification, named entity recognition, relation classification, and joint extraction of entities and relations. Comprehensive experiments validate the effectiveness of several state-of-the-art models for these challenges while leaving large space for improvement. We also perform the error analysis and point out some unique challenges that require further investigation. We will release our annotation scheme, the corpus, and codes to the research community to alleviate the scarcity of labeled data in this domain.", "track": "Information Extraction", "label": 5}, {"loc": [4.773200035095215, 4.7127227783203125], "id": 3965, "title": "Validity Assessment of Legal Will Statements as Natural Language Inference", "authors": "Alice Saebom Kwak, Jacob O. Israelsen, Clayton T. Morrison, Derek E. Bambauer and Mihai Surdeanu", "abstract": "This work introduces a natural language inference (NLI) dataset that focuses on the validity of statements in legal wills. This dataset is unique because: (a) each entailment decision requires three inputs: the statement from the will, the law, and the conditions that hold at the time of the testator's death; and (b) the included texts are longer than the ones in current NLI datasets. We trained eight neural NLI models in this dataset. All the models achieve more than 80% macro F1 and accuracy, which indicates that neural approaches can handle this task reasonably well. However, group accuracy, a stricter evaluation measure that is calculated with a group of positive and negative examples generated from the same statement as a unit, is in mid 80s at best, which suggests that the models' understanding of the task remains superficial. Further ablative analyses and explanation experiments indicate that all three text segments are used for prediction, but some decisions rely on semantically irrelevant tokens. This indicates that overfitting on these longer texts likely happens, and that additional research is required for this task to be solved.", "track": "Resources and Evaluation", "label": 1}, {"loc": [7.908806324005127, 9.121292114257812], "id": 3975, "title": "AdaPrompt: Adaptive Model Training for Prompt-based NLP", "authors": "Yulong Chen, Yang Liu, Li Dong, Shuohang Wang, Chenguang Zhu, Michael Zeng and Yue Zhang", "abstract": "Prompt-based learning, with its capability to tackle zero-shot and few-shot NLP tasks, has gained much attention in the community.\nThe main idea is to bridge the gap between NLP downstream tasks and language modeling (LM), by mapping these tasks into natural language prompts, which are then filled by pre-trained language models (PLMs).\nHowever, for prompt learning, there are still two salient gaps between NLP tasks and pretraining.\nFirst, prompt information is not necessarily sufficiently present during LM pre-training. \nSecond, task-specific data are not necessarily well represented during pre-training. \nWe address these two issues by proposing AdaPrompt, adaptively retrieving external data for continual pretraining of PLMs by making use of both task and prompt characteristics. \nIn addition, we make use of knowledge in Natural Language Inference models for deriving adaptive verbalizers.\nExperimental results on five NLP benchmarks show that AdaPrompt can improve over standard PLMs in few-shot settings. \nIn addition, in zero-shot settings, our method outperforms standard prompt-based methods by up to 26.35\\% relative error reduction.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [6.4626898765563965, 1.9154670238494873], "id": 3988, "title": "Code Generation From Flowcharts with Texts: A Benchmark Dataset and An Approach", "authors": "ZeJie Liu, Xiaoyu Hu, Deyu Zhou, lin li, Xu Zhang and Yanzheng Xiang", "abstract": "Currently, researchers focus on generating codes from the requirement documents. However, current approaches still perform poorly on some requirements needing complex problem-solving skills. In reality, to tackle such complex requirements, instead of directly translating requirement documents into codes, software engineers write codes via unified modeling language diagrams, such as flowcharts, an intermediate tool to analyze and visualize the system. Therefore, we propose a new source code generation task, that is, to generate source code from flowcharts with texts. We manually construct a benchmark dataset containing 320 flowcharts with their corresponding source codes. Obviously, it is not straightforward to employ the current approaches for the new source code generation task since (1) the flowchart is a graph that contains various structures, including loop, selection, and others which is different from texts; (2) the connections between nodes in the flowchart are abundant and diverse which need to be carefully handled. To solve the above problems, we propose a two-stage code generation model. In the first stage, a structure recognition algorithm is employed to transform the flowchart into pseudo-code containing the structural conventions of a typical programming language such as while, if. In the second stage, a code generation model is employed to convert the pseudo-code into code. Experimental results show that the proposed approach can achieve some improvement over the baselines.", "track": "NLP Applications", "label": 0}, {"loc": [5.242216110229492, 12.558262825012207], "id": 3998, "title": "Focus! Relevant and Sufficient Context Selection for News Image Captioning", "authors": "Mingyang Zhou, Grace Luo, Anna Rohrbach and Zhou Yu", "abstract": "News Image Captioning requires describing an image by leveraging additional context derived from a news article. Previous works only coarsely leverage the article to extract the necessary context, which makes it challenging for models to identify relevant events and named entities. In our paper, we first demonstrate that by combining more fine-grained context that captures the key named entities (obtained via an oracle) and the global context that summarizes the news, we can dramatically improve the model's ability to generate accurate news captions. This begs the question, how to automatically extract such key entities from an image? We propose to use pre-trained vision and language retrieval model CLIP to localize the visually grounded entities in the news article, and then capture the non-visual entities via a open relation extraction model. Our experiments demonstrate that by simply selecting better context from the article, we can significantly improve the performance of existing models and achieve the new state-of-the-art performance on multiple benchmarks.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [1.043867826461792, 10.542702674865723], "id": 4014, "title": "Generative Aspect-Based Sentiment Analysis with Contrastive Learning and Expressive Structure", "authors": "Joseph Peper and Lu Wang", "abstract": "Generative models have demonstrated impressive results on Aspect-based Sentiment Analysis (ABSA) tasks, particularly for the emerging task of extracting Aspect-Category-Opinion-Sentiment (ACOS) quadruples. However, these models struggle with implicit sentiment expressions, which are commonly observed in opinionated content such as online reviews. \nIn this work, we introduce GEN-SCL-NAT, which consists of two techniques for improved structured generation for ACOS quadruple extraction. First, we propose GEN-SCL, a supervised contrastive learning objective that aids quadruple prediction by encouraging the model to produce input representations that are discriminable across key input attributes, such as sentiment polarity and the existence of implicit opinions and aspects. Second, we introduce GEN-NAT, a new structured generation format that better adapts pre-trained autoregressive encoder-decoder models to extract quadruples in a generative fashion. \nExperimental results show that GEN-SCL-NAT achieves top performance across three ACOS datasets, averaging 1.48% F1 improvement, with a maximum 1.73% increase on the LAPTOP-L1 dataset. Additionally, we see significant gains on implicit aspect and opinion splits that have been shown as challenging for existing ACOS approaches.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [7.778597831726074, 6.722175598144531], "id": 4025, "title": "Semantic Dependency Parsing with Edge GNNs", "authors": "Songlin Yang and Kewei Tu", "abstract": "Second-order neural parsers have obtained high accuracy in semantic dependency parsing.\n Inspired by the factor graph representation of second-order parsing, we propose edge graph neural networks (E-GNNs). In an E-GNN, each node corresponds to a dependency edge, and the neighbors are defined in terms of sibling, co-parent, and grandparent relationships. We conduct experiments on SemEval 2015 Task 18 English datasets, showing the superior performance of E-GNNs.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [0.8032247424125671, 8.081657409667969], "id": 4031, "title": "Explore Unsupervised Structures in Pretrained Models for Relation Extraction", "authors": "xi yang, Tao Ji and Yuanbin Wu", "abstract": "Syntactic trees have been widely applied in relation extraction (RE). However, since parsing qualities are not stable on different text domains and a pre-defined grammar may not well fit the target relation schema, the introduction of syntactic structures sometimes fails to improve RE performances consistently. In this work, we study RE models with various unsupervised structures mined from pre-trained language models (e.g., BERT). We show that, similar to syntactic trees, unsupervised structures are quite informative for RE task: they are able to obtain competitive (even the best) performance scores on benchmark RE datasets (ACE05, WebNLG, SciERC). We also conduct detailed analyses on their abilities of adapting new RE domains and influence of noise links in those structures. The results suggest that unsupervised structures are reasonable alternatives of commonly used syntactic structures in relation extraction models.", "track": "Information Extraction", "label": 5}, {"loc": [7.5557942390441895, 3.7133142948150635], "id": 4072, "title": "Identifying Human Strategies for Generating Word-Level Adversarial Examples", "authors": "Maximilian Mozes, Bennett Kleinberg and Lewis Griffin", "abstract": "Adversarial examples in NLP are receiving increasing research attention. One line of investigation is the generation of word-level adversarial examples against fine-tuned Transformer models that preserve naturalness and grammaticality. Previous work found that human- and machine-generated adversarial examples are comparable in their naturalness and grammatical correctness. Most notably, humans were able to generate adversarial examples much more effortlessly than automated attacks. In this paper, we provide a detailed analysis of exactly how humans create these adversarial examples. By exploring the behavioural patterns of human workers during the generation process, we identify statistically significant tendencies based on which words humans prefer to select for adversarial replacement (e.g., word frequencies, word saliencies, sentiment) as well as where and when words are replaced in an input sequence. With our findings, we seek to inspire efforts that harness human strategies for more robust NLP models.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [7.85883903503418, 9.804229736328125], "id": 4098, "title": "Refinement Matters: Textual Description Needs to be Refined for Zero-shot Learning", "authors": "Chandan Gautam, Sethupathy Parameswaran, Vinay Verma, Suresh Sundaram and Savitha Ramasamy", "abstract": "Zero-Shot Learning (ZSL) has shown great promise at the intersection of vision and language, and generative methods for ZSL are predominant owing to their efficiency. Moreover, textual description or attribute plays a critical role in transferring knowledge from the seen to unseen classes in ZSL. Such generative approaches for ZSL are very costly to train and require the class description of the unseen classes during training. In this work, we propose a non-generative gating-based attribute refinement network for ZSL, which achieves similar accuracies to generative methods of ZSL, at a much lower computational cost. The refined attributes are mapped into the visual domain through an attribute embedder, and the whole network is guided by the circle loss and the well-known softmax cross-entropy loss to obtain a robust class embedding. We refer to our approach as Circle loss guided gating-based Attribute-Refinement Network (CARNet). We perform extensive experiments on the five benchmark datasets over the various challenging scenarios viz., Generalized ZSL (GZSL), Continual GZSL (CGZSL), and conventional ZSL. We observe that the CARNet significantly outperforms recent non-generative ZSL methods and most generative ZSL methods in all three settings by a significant margin. Our extensive ablation study disentangles the performance of various components and justifies their importance. The source code is available at https://github.com/Sethup123/CARNet.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [7.448903560638428, 9.582947731018066], "id": 4122, "title": "SAT: Improving Semi-Supervised Text Classification with Simple Instance-Adaptive Self-Training", "authors": "Hui Chen, Wei Han and Soujanya Poria", "abstract": "Self-training methods have been explored in recent years and have exhibited great performance in improving semi-supervised learning. This work presents a simple instance-adaptive self-training method (SAT) for semi-supervised text classification. SAT first generates two augmented views for each unlabeled data, and then trains a meta learner to automatically identify the relative strength of augmentations based on the similarity between the original view and the augmented views. The weakly-augmented view is fed to the model to produce a pseudo-label and the strongly-augmented view is used to train the model to predict the same pseudo-label. We conducted extensive experiments and analyses on three text classification datasets and found that with varying sizes of labeled training data, SAT consistently shows competitive performance compared to existing semi-supervised learning methods.", "track": "Ethic Concerns:Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [2.9278390407562256, 4.662806510925293], "id": 4133, "title": "Answer Quality Aware Aggregation for Extractive QA Crowdsourcing", "authors": "Peide Zhu, Zhen Wang, Claudia Hauff, Jie Yang and Avishek Anand", "abstract": "Quality control is essential for creating extractive question answering (EQA) datasets via crowdsourcing. Aggregation across answers, i.e. word spans within passages annotated, by different crowd workers is one major focus for ensuring its quality. However, crowd workers cannot reach a consensus on a considerable portion of questions. We introduce a simple yet effective answer aggregation method that takes into account the relations among the answer, question, and context passage. We evaluate answer quality from both the view of question answering model to determine how confident the QA model is about each answer and the view of the answer verification model to determine whether the answer is correct. Then we compute aggregation scores with each answer's quality and its contextual embedding produced by pre-trained language models. The experiments on a large real crowdsourced EQA dataset show that our framework outperforms baselines by around 16% on precision and effectively conduct answer aggregation for extractive QA task.", "track": "Question Answering", "label": 11}, {"loc": [0.6747884154319763, 6.558532238006592], "id": 4136, "title": "Search to Pass Messages for Temporal Knowledge Graph Completion", "authors": "Zhen Wang, Haotong Du, Quanming Yao and Xuelong Li", "abstract": "Completing missing facts is a fundamental task for temporal knowledge graphs (TKGs).\nRecently, graph neural network (GNN) based methods, which can simultaneously explore topological and temporal information, have become the state-of-the-art (SOTA) to complete TKGs. However, these studies are based on hand-designed architectures and fail to explore the diverse topological and temporal properties of TKG.\nTo address this issue, we propose to use neural architecture search (NAS) to design data-specific message passing architecture for TKG completion.\nIn particular, \nwe develop a generalized framework to explore topological and temporal information in TKGs.\nBased on this framework, we design an expressive search space to fully capture various properties of different TKGs. \nMeanwhile, we adopt a search algorithm, which trains a supernet structure by sampling single path for efficient search with less cost.\nWe further conduct extensive experiments on three benchmark datasets. The results show that the searched architectures by our method achieve the SOTA performances.\nBesides, \nthe searched models can also implicitly reveal diverse properties in different TKGs.\nOur code is released in https://github.com/striderdu/SPA.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [6.6042609214782715, 1.883830189704895], "id": 4145, "title": "Code Vulnerability Detection via Nearest Neighbor Mechanism", "authors": "Qianjin Du, Xiaohui Kuang and Gang Zhao", "abstract": "Code vulnerability detection is a fundamental and challenging task in the software security field. Existing research works aim to learn semantic information from the source code by utilizing NLP technologies. However, in vulnerability detection tasks, some vulnerable samples are very similar to non-vulnerable samples, which are difficult to identify. To address this issue and improve detection performance, we introduce the $k$-nearest neighbor mechanism which retrieves multiple neighbor samples and utilizes label information of retrieved neighbor samples to provide help for model predictions. Besides, we use supervised contrastive learning to make the model learn the discriminative representation and ensure that label information of retrieved neighbor samples is as consistent as possible with the label information of testing samples. Extensive experiments show that our method can achieve obvious performance improvements compared to baseline models.", "track": "NLP Applications", "label": 0}, {"loc": [3.3504326343536377, 4.088421821594238], "id": 4159, "title": "Robust Question Answering against Distribution Shifts with Test-Time Adaption: An Empirical Study", "authors": "Hai Ye, Yuyang Ding, Juntao Li and Hwee Tou Ng", "abstract": "A deployed question answering (QA) model can easily fail when the test data has a distribution shift compared to the training data. Robustness tuning (RT) methods have been widely studied to enhance model robustness against distribution shifts before model deployment. However, can we improve a model after deployment? To answer this question, we evaluate test-time adaptation (TTA) to improve a model after deployment. We first introduce ColdQA, a unified evaluation benchmark for robust QA against text corruption and changes in language and domain. We then evaluate previous TTA methods on ColdQA and compare them to RT methods. We also propose a novel TTA method called online imitation learning (OIL). Through extensive experiments, we find that TTA is comparable to RT methods, and applying TTA after RT can significantly boost the performance on ColdQA. We observe that TTA can help the model better generalize to more distant distributions. Our proposed OIL improves TTA to be more robust to variation in hyper-parameters and test distributions over time.", "track": "Question Answering", "label": 11}, {"loc": [8.136977195739746, 5.216827392578125], "id": 4173, "title": "ParaMac: A General Unsupervised Paraphrase Generation Framework Leveraging Semantic Constraints and Diversifying Mechanisms", "authors": "Jinxin Liu, Jiaxin Shi, Ji Qi, Lei Hou, Juanzi Li and Qi Tian", "abstract": "Paraphrase generation reflects the ability to understand the meaning from the language surface form and rephrase it to other expressions. Recent paraphrase generation works have paid attention to unsupervised approaches based on Pre-trained Language Models (PLMs) to avoid heavy reliance on parallel data by utilizing PLMs' generation ability. However, the generated pairs of existing unsupervised methods are usually weak either in semantic equivalence or expression diversity. In this paper, we present a novel unsupervised paraphrase generation framework called Paraphrase Machine. By employing multi-aspect equivalence constraints and multi-granularity diversifying mechanisms, Paraphrase Machine is able to achieve good semantic equivalence and expressive diversity, producing a high-quality unsupervised paraphrase dataset. Based on this dataset, we train a general paraphrase model, which can be directly applied to rewrite the input sentence of various domains without any fine-tuning, and achieves substantial gains of 9.1% and 3.3% absolutely in BLEU score over previous SOTA on Quora and MSCOCO. By further fine-tuning our model with domain-specific training sets, the improvement can be increased to even 18.0% and 4.6%. Most importantly, by applying it to language understanding and generation tasks under the low-resource setting, we demonstrate that our model can serve as a universal data augmentor to boost the few-shot performance (e.g., average 2.0% gain on GLUE).", "track": "Natural Language Generation", "label": 6}, {"loc": [4.5458292961120605, 7.7278008460998535], "id": 4190, "title": "Semi-supervised New Slot Discovery with Incremental Clustering", "authors": "Yuxia Wu, Lizi Liao, Xueming Qian and Tat-Seng Chua", "abstract": "Discovering new slots is critical to the success of dialogue systems. Most existing methods rely on automatic slot induction in unsupervised fashion or perform domain adaptation across zero or few-shot scenarios. They have difficulties in providing high-quality supervised signals to learn clustering-friendly features, and are limited in effectively transferring the prior knowledge from known slots to new slots. In this work, we propose a Semi-supervised Incremental Clustering method (SIC), to discover new slots with the aid of existing linguistic annotation models and limited known slot data. Specifically, we harvest slot value candidates with NLP model cues and innovatively formulate the slot discovery task under an incremental clustering framework. The model gradually calibrate slot representations under the supervision of generated pseudo-labels, and automatically learns to terminate when no more salient slot remains. Our thorough evaluation on five public datasets demonstrates that it significantly outperforms state-of-the-art models.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [10.601932525634766, 7.575677394866943], "id": 4201, "title": "Con-NAT: Contrastive Non-autoregressive Neural Machine Translation", "authors": "Hao Cheng and Zhihua Zhang", "abstract": "Inspired by the success of contrastive learning in natural language processing, we incorporate contrastive learning into the conditional masked language model which is extensively used in non-autoregressive neural machine translation (NAT). Accordingly, we propose a Contrastive Non-autoregressive Neural Machine Translation (Con-NAT) model. Con-NAT optimizes the similarity of several different representations of the same token in the same sentence. We propose two methods to obtain various representations: Contrastive Common Mask and Contrastive Dropout. Positive pairs are various different representations of the same token, while negative pairs are representations of different tokens. In the feature space, the model with contrastive loss pulls positive pairs together and pushes negative pairs away. We conduct extensive experiments on six translation directions with different data sizes. The results demonstrate that Con-NAT showed a consistent and significant improvement in fully and iterative NAT. Con-NAT is state-of-the-art on WMT'16 Ro-En (34.18 BLEU).", "track": "Machine Translation", "label": 10}, {"loc": [7.699717998504639, 7.823615074157715], "id": 4223, "title": "Improved Knowledge Distillation for Pre-trained Language Models via Knowledge Selection", "authors": "Chenglong Wang, Yi Lu, Yongyu Mu, Yimin Hu, Tong Xiao and Jingbo Zhu", "abstract": "Knowledge distillation addresses the problem of transferring knowledge from a teacher model to a student model.\nIn this process, we typically have multiple types of knowledge extracted from the teacher model.\nThe problem is to make full use of them to train the student model.\nOur preliminary study shows that: (1) not all of the knowledge is necessary for learning a good student model, and (2) knowledge distillation can benefit from certain knowledge at different training steps.\nIn response to these, we propose an actor-critic approach to selecting appropriate knowledge to transfer during the process of knowledge distillation.\nIn addition, we offer a refinement of the training algorithm to ease the computational burden.\nExperimental results on the GLUE datasets show that our method outperforms several strong knowledge distillation baselines significantly.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [1.2883535623550415, 7.921083450317383], "id": 4235, "title": "Syntactically Robust Training on Partially-Observed Data for Open Information Extraction", "authors": "Ji Qi, Yuxiang Chen, Lei Hou, Juanzi Li and Bin Xu", "abstract": "Open Information Extraction models have shown promising results with sufficient supervision. However, these models face a fundamental challenge that the syntactic distribution of training data is partially observable in comparison to the real world. In this paper, we propose a syntactically robust training framework that enables models to be trained on a syntactic-abundant distribution based on diverse paraphrase generation. To tackle the intrinsic problem of knowledge deformation of paraphrasing, two algorithms based on semantic similarity matching and syntactic tree walking are used to restore the expressionally transformed knowledge. The training framework can be generally applied to other syntactic partial observable domains. Based on the proposed framework, we build a new evaluation set called CaRB-AutoPara, a syntactically diverse dataset consistent with the real-world setting for validating the robustness of the models. Experiments including a thorough analysis show that the performance of the model degrades with the increase of the difference in syntactic distribution, while our framework gives a robust boundary.", "track": "Information Extraction", "label": 5}, {"loc": [10.80473804473877, 9.30996322631836], "id": 4240, "title": "A Benchmark and Dataset for Post-OCR text correction in Sanskrit", "authors": "Ayush Maheshwari, Nikhil Singh, Amrith Krishna and Ganesh Ramakrishnan", "abstract": "Sanskrit is a classical language with about 30 million extant manuscripts fit for digitisation, available in written, printed or scanned-image forms. However, it is still considered to be a low-resource language when it comes to available digital resources. In this work, we release a post-OCR text correction dataset containing around 218,000 sentences, with 1.5 million words, from 30 different books. Texts in Sanskrit are known to be diverse in terms of their linguistic and stylistic usage since Sanskrit was the `lingua francua' for discourse in the Indian subcontinent for about 3 millennia. Keeping this in mind, we release a multi-domain dataset, from areas as diverse as astronomy, medicine and mathematics, with some of them as old as 18 centuries. Further, we release multiple strong baselines as benchmarks for the task, based on pre-trained Seq2Seq language models. We find that our best-performing model, consisting of byte level tokenization in conjunction with phonetic encoding (Byt5+SLP1), yields a 23% point increase over the OCR output in terms of word and character error rates. Moreover, we perform extensive experiments in evaluating these models on their performance and analyse common causes of mispredictions both at the graphemic and lexical levels. Our code and dataset is publicly available at https://github.com/ayushbits/pe-ocr-sanskrit.", "track": "NLP Applications", "label": 0}, {"loc": [2.020648956298828, 7.5630202293396], "id": 4248, "title": "Knowledge-Enhanced Self-Supervised Prototypical Network for Few-Shot Event Detection", "authors": "Kailin Zhao, Xiaolong Jin, Long Bai, Jiafeng Guo and Xueqi Cheng", "abstract": "Prototypical network based joint methods have attracted much attention in few-shot event detection, which carry out event detection in a unified sequence tagging framework. However, these methods suffer from the inaccurate prototype representation problem, due to two main reasons: the number of instances for calculating prototypes is limited; And, they do not well capture the relationships among event prototypes. To deal with this problem, we propose a Knowledge-Enhanced self-supervised Prototypical Network, called KE-PN, for few-shot event detection. KE-PN adopts hybrid rules, which can automatically align event types to an external knowledge base, i.e., FrameNet, to obtain more instances.\nIt proposes a self-supervised learning method to filter out noisy data from enhanced instances. KE-PN is further equipped with an auxiliary event type relationship classification module, which injects the relationship information into representations of event prototypes. Extensive experiments on three benchmark datasets, i.e., FewEvent, MAVEN, and ACE2005 demonstrate the state-of-the-art performance of KE-PN.", "track": "Information Extraction", "label": 5}, {"loc": [7.9518656730651855, 8.596741676330566], "id": 4263, "title": "VarMAE: Pre-training of Variational Masked Autoencoder for Domain-adaptive Language Understanding", "authors": "Dou Hu, Xiaolong Hou, Xiyang Du, Mengyuan Zhou, Lianxin Jiang, Yang Mo and Xiaofeng Shi", "abstract": "Pre-trained language models have been widely applied to standard benchmarks. Due to the flexibility of natural language, the available resources in a certain domain can be restricted to support obtaining precise representation. To address this issue, we propose a novel Transformer-based language model named VarMAE for domain-adaptive language understanding. Under the masked autoencoding objective, we design a context uncertainty learning module to encode the token's context into a smooth latent distribution. The module can produce diverse and well-formed contextual representations. Experiments on science- and finance-domain NLU tasks demonstrate that VarMAE can be efficiently adapted to new domains with limited resources.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [9.62819766998291, 6.357034683227539], "id": 4270, "title": "Exploring Methods for Building Dialects-Mandarin Code-Mixing Corpora: A Case Study in Taiwanese Hokkien", "authors": "Sin-En Lu, Bo-Han Lu, Chao-Yi Lu and Richard Tzong-Han Tsai", "abstract": "In natural language processing (NLP), code-mixing (CM) is a challenging task, especially when the mixed languages include dialects. In Southeast Asian countries such as Singapore, Indonesia, and Malaysia, Hokkien-Mandarin is the most widespread code-mixed language pair among Chinese immigrants, and it is also common in Taiwan. However, dialects such as Hokkien often have a scarcity of resources and the lack of an official writing system, limiting the development of dialect CM research. In this paper, we propose a method to construct a Hokkien-Mandarin CM dataset to mitigate the limitation, overcome the morphological issue under the Sino-Tibetan language family, and offer an efficient Hokkien word segmentation method through a linguistics-based toolkit. Furthermore, we use our proposed dataset and employ transfer learning to train the XLM (cross-lingual language model) for translation tasks. To fit the code-mixing scenario, we adapt XLM slightly. We found that by using linguistic knowledge, rules, and language tags, the model produces good results on CM data translation while maintaining monolingual translation quality.", "track": "Resources and Evaluation", "label": 1}, {"loc": [6.105129241943359, 8.96927261352539], "id": 4286, "title": "Recurrence Boosts Diversity! Revisiting Recurrent Latent Variable in Transformer-Based Variational AutoEncoder for Diverse Text Generation", "authors": "Jinyi Hu, Xiaoyuan Yi, Wenhao Li, Maosong Sun and Xing Xie", "abstract": "Variational Auto-Encoder (VAE) has been widely adopted in text generation. Among many variants, recurrent VAE learns token-wise latent variables with each conditioned on the preceding ones, which captures sequential variability better in the era of RNN. However, it is unclear how to incorporate such recurrent dynamics into the recently dominant Transformer due to its parallelism. In this work, we propose TRACE, a Transformer-based recurrent VAE structure. TRACE imposes recurrence on segment-wise latent variables with arbitrarily separated text segments and constructs the posterior distribution with residual parameterization. Besides, we design an acceleration method by approximating idempotent matrices, which allows parallelism while maintaining the conditional dependence of latent variables. We demonstrate that TRACE could deduce a non-zero lower bound of the KL term and enhance the entanglement of each segment and preceding latent variables, providing a theoretical guarantee of generation diversity. Experiments on two unconditional and one conditional generation task show that TRACE achieves significantly improved diversity while maintaining satisfactory generation quality.", "track": "Natural Language Generation", "label": 6}, {"loc": [6.499881267547607, 7.514617919921875], "id": 4301, "title": "Tweet Based Reach Aware Temporal Attention Network for NFT Valuation", "authors": "Ramit Sawhney, Megh Thakkar, Ritesh Soun, Atula Neerkaje, Vasu Sharma, Dipanwita Guhathakurta and Sudheer Chava", "abstract": "Non-Fungible Tokens (NFTs) are a relatively unexplored class of assets. Designing strategies to forecast NFT trends is an intricate task due to its extremely volatile nature. The market is largely driven by public sentiment and \"hype\", which in turn has a high correlation with conversations taking place on social media platforms like Twitter. Prior work done for modelling stock market data does not take into account the extent of impact certain highly influential tweets and their authors can have on the market. Building on these limitations and the nature of the NFT market, we propose a novel reach-aware temporal learning approach to make predictions for forecasting future trends in the NFT market. We perform experiments on a new dataset consisting of over 1.3 million tweets and 180 thousand NFT transactions spanning over 15 NFT collections curated by us. Our model (TA-NFT) outperforms other state-of-the-art methods by an average of 36%. Through extensive quantitative and ablative analysis, we demonstrate the ability of our approach as a practical method for predicting NFT trends.", "track": "NLP Applications", "label": 0}, {"loc": [1.6102356910705566, 8.588443756103516], "id": 4305, "title": "Entity Embedding Completion for Wide-Coverage Entity Disambiguation", "authors": "Daisuke Oba, Ikuya Yamada, Naoki Yoshinaga and Masashi Toyoda", "abstract": "Entity disambiguation (ED) is typically solved by learning to classify a given mention into one of the entities in the model's entity vocabulary by referring to their embeddings. However, this approach cannot address mentions of entities that are not covered by the entity vocabulary. Aiming to enhance the applicability of ED models, we propose a method of extending a state-of-the-art ED model by dynamically computing embeddings of out-of-vocabulary entities. Specifically, our method computes embeddings from entity descriptions and mention contexts. Experiments with standard benchmark datasets show that the extended model performs comparable to or better than existing models whose entity embeddings are trained for all candidate entities as well as embedding-free models. We release our source code and model checkpoints at https://github.com/studio-ousia/steel.", "track": "Information Extraction", "label": 5}, {"loc": [5.595871925354004, 12.727925300598145], "id": 4306, "title": "Entity-level Interaction via Heterogeneous Graph for Multimodal Named Entity Recognition", "authors": "Gang Zhao, Guanting Dong, Yidong Shi, Haolong Yan, Weiran Xu and Si Li", "abstract": "Multimodal Named Entity Recognition (MNER) faces two specific challenges: 1) How to capture useful entity-related visual information. 2) How to alleviate the interference of visual noise. Previous works have gained progress by improving interacting mechanisms or seeking for better visual features. However, existing methods neglect the integrity of entity semantics and conduct cross-modal interaction at token-level, which cuts apart the semantics of entities and makes non-entity tokens easily interfered with by irrelevant visual noise. Thus in this paper, we propose an end-to-end heterogeneous Graph-based Entity-level Interacting model (GEI) for MNER. GEI first utilizes a span detection subtask to obtain entity representations, which serve as the bridge between two modalities. Then, the heterogeneous graph interacting network interacts entity with object nodes to capture entity-related visual information, and fuses it into only entity-associated tokens to rid non-entity tokens of the visual noise. Experiments on two widely used datasets demonstrate the effectiveness of our method. Our code will be available at https://github.com/GangZhao98/GEI.", "track": "Information Extraction", "label": 5}, {"loc": [5.263582229614258, 6.000616073608398], "id": 4339, "title": "Status Biases in Deliberation Online: Evidence from a Randomized Experiment on ChangeMyView", "authors": "Emaad Manzoor, Yohan Jo and Alan M. Montgomery", "abstract": "Status is widely used to incentivize user engagement online. However, visible status indicators could inadvertently bias online deliberation to favor high-status users. In this work, we design and deploy a randomized experiment on the ChangeMyView platform to quantify status biases in deliberation online. We find strong evidence of status bias: hiding status on ChangeMyView increases the persuasion rate of moderate-status users by 84% and decreases the persuasion rate of high-status users by 41% relative to the control group. We also find that the persuasive power of status is moderated by verbosity, suggesting that status is used as an information-processing heuristic under cognitive load. Finally, we find that a user's status influences the argumentation behavior of other users they interact with in a manner that disadvantages low and moderate-status users.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [4.779438495635986, 6.95224142074585], "id": 4354, "title": "Empathetic and Emotionally Positive Conversation Systems with an Emotion-specific Query-Response Memory", "authors": "Zhiliang Tian, Yinliang Wang, YIPING SONG, Chi ZHANG, Dongkyu Lee, Yingxiu Zhao, Dongsheng Li and Nevin L. Zhang", "abstract": "Emotional conversation systems generate responses for the input queries considering the speaker's emotions in a conversation. Existing emotional conversation systems output emotional responses according to either a given emotion or the user's emotion reflected in the input queries. Following a given emotion may lead to an emotional drift between the given emotion and the conversation state, and following only the user's emotion may aggravate the user's negative feelings if users suffer from a negative mood. In this paper, we propose to generate empathetic responses catering to the user's emotions while leading the conversation to be emotionally positive. Particularly, by abstracting the conversation corpus, we extract and store the different responding strategies for different users' emotions and conversational topics into a memory. We encourage positive emotions in conversation via a sentiment evaluator. We model the memory outputs with a Gaussian mixture distribution and sample a final responding strategy from the distribution. The strategy acts as a condition to a transformer model to generate responses. The experiments verify our model surpasses the baseline methods in appropriateness, diversity, and generating emotionally positive responses.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [2.492269277572632, 8.652530670166016], "id": 4355, "title": "Trial2Vec: Zero-Shot Clinical Trial Document Similarity Search using Self-Supervision", "authors": "Zifeng Wang and Jimeng Sun", "abstract": "Clinical trials are essential for drug development but are extremely expensive and time-consuming to conduct. It is beneficial to study similar historical trials when designing a clinical trial. However, lengthy trial documents and lack of labeled data make trial similarity search difficult. \nWe propose a zero-shot}clinical trial retrieval method, called Trial2Vec, which learns through self-supervision without the need for annotating similar clinical trials. Specifically, the meta-structure of trial documents (e.g., title, eligibility criteria, target disease) along with clinical knowledge (e.g., UMLS knowledge base) are leveraged to automatically generate contrastive samples. Besides, \\method encodes trial documents considering meta-structure thus producing compact embeddings aggregating multi-aspect information from the whole document. We show that our method yields medically interpretable embeddings by visualization and it gets 15\\% average improvement over the best baselines on precision/recall for trial retrieval, which is evaluated on our labeled 1600 trial pairs. In addition, we prove the pretrained embeddings benefit the downstream trial outcome prediction task over 240k trials. Software is available at https://github.com/RyanWangZf/Trial2Vec.", "track": "NLP Applications", "label": 0}, {"loc": [7.742769718170166, 7.791601657867432], "id": 4358, "title": "From Mimicking to Integrating: Knowledge Integration for Pre-Trained Language Models", "authors": "Lei Li, Yankai Lin, Xuancheng Ren, Guangxiang Zhao, Peng Li, Jie Zhou and Xu Sun", "abstract": "Investigating better ways to reuse the released pre-trained language models (PLMs) can significantly reduce the computational cost and the potential environmental side-effects. This paper explores a novel PLM reuse paradigm, Knowledge Integration (KI). Without human annotations available, KI aims to merge the knowledge from different teacher-PLMs, each of which specializes in a different classification problem, into a versatile student model. To achieve this, we first derive the correlation between virtual golden supervision and teacher predictions. We then design a Model Uncertainty--aware Knowledge Integration (MUKI) framework to recover the golden supervision for the student. Specifically, MUKI adopts Monte-Carlo Dropout to estimate model uncertainty for the supervision integration. An instance-wise re-weighting mechanism based on the margin of uncertainty scores is further incorporated, to deal with the potential conflicting supervision from teachers.\nExperimental results demonstrate that MUKI achieves substantial improvements over baselines on benchmark datasets. Further analysis shows that MUKI can generalize well for merging teacher models with heterogeneous architectures, and even teachers major in cross-lingual datasets.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [9.213130950927734, 6.6678595542907715], "id": 4365, "title": "Model and Data Transfer for Cross-Lingual Sequence Labelling in Zero-Resource Settings", "authors": "Iker Garc\u00eda-Ferrero, Rodrigo Agerri and German Rigau", "abstract": "Zero-resource cross-lingual transfer approaches aim to apply supervised models\nfrom a source language to unlabelled target languages. In this paper we perform\nan in-depth study of the two main techniques employed so far for cross-lingual\nzero-resource sequence labelling, based either on data or model transfer.\nAlthough previous research has proposed translation and annotation projection\n(data-based cross-lingual transfer) as an effective technique for cross-lingual\nsequence labelling, in this paper we experimentally demonstrate that high\ncapacity multilingual language models applied in a zero-shot (model-based\ncross-lingual transfer) setting consistently outperform data-based\ncross-lingual transfer approaches. A detailed analysis of our results suggests\nthat this might be due to important differences in language use. More\nspecifically, machine translation often generates a textual signal which is\ndifferent to what the models are exposed to when using gold standard data,\nwhich affects both the fine-tuning and evaluation processes. Our results also\nindicate that data-based cross-lingual transfer approaches remain a competitive\noption when high-capacity multilingual language models are not available.", "track": "Multilinguality", "label": 13}, {"loc": [9.156103134155273, 6.384519100189209], "id": 4367, "title": "Early Guessing for Dialect Identification", "authors": "Vani Kanjirangat, Tanja Samardzic, Fabio Rinaldi and Ljiljana Dolamic", "abstract": "This paper deals with the problem of incre-\nmental dialect identification. Our goal is to\nreliably determine the dialect before the full\nutterance is given as input. The major part\nof the previous research on dialect identification has been model-centric, focusing on performance. We address a new question: How much input is needed to identify a dialect? Our\napproach is a data-centric analysis that results\nin general criteria for finding the shortest input\nneeded to make a plausible guess. Working\nwith three sets of language dialects (Swiss German, Indo-Aryan and Arabic languages), we\nshow that it is possible to generalize across dialects and datasets with two input shortening\ncriteria: model confidence and minimal input\nlength (adjusted for the input type). The source\ncode for experimental analysis can be found at\nGithub.", "track": "Multilinguality", "label": 13}, {"loc": [7.5415520668029785, 3.7444019317626953], "id": 4368, "title": "R-AT: Regularized Adversarial Training for Natural Language Understanding", "authors": "Shiwen Ni, Jiawen Li and Hung-Yu Kao", "abstract": "Currently, adversarial training has become a popular and powerful regularization method in the natural language domain. In this paper, we Regularized Adversarial Training (R-AT) via dropout, which forces the output probability distributions of different sub-models generated by dropout to be consistent under the same adversarial samples. Specifically, we generate adversarial samples by perturbing the word embeddings. For each adversarial sample fed to the model, R-AT minimizes both the adversarial risk and the bidirectional KL-divergence between the adversarial output distributions of two sub-models sampled by dropout. Through extensive experiments on 13 public natural language understanding datasets, we found that R-AT has improvements for many models (e.g., rnn-based, cnn-based, and transformer-based models). For the GLUE benchmark, when R-AT is only applied to the fine-tuning stage, it is able to improve the overall test score of the BERT-base model from 78.3 to 79.6 and the RoBERTa-large model from 88.1 to 88.6. Theoretical analysis reveals that R-AT has potential gradient regularization during the training process. Furthermore, R-AT can reduce the inconsistency between training and testing of models with dropout.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [7.927786350250244, 9.20006275177002], "id": 4373, "title": "Deep Active Learning with Pretrained Language Models for Filtering Data in Microblogs", "authors": "Payam Karisani, Negin Karisani and Li Xiong", "abstract": "", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [8.014862060546875, 8.572415351867676], "id": 4375, "title": "Forging Multiple Training Objectives for Pre-trained Language Models via Meta-Learning", "authors": "Hongqiu Wu, Ruixue Ding, Hai Zhao, Boli Chen, Pengjun Xie, Fei Huang and Min Zhang", "abstract": "Multiple pre-training objectives fill the vacancy of the understanding capability of single-objective language modeling, which serves the ultimate purpose of pre-trained language models (PrLMs), generalizing well on a mass of scenarios. However, learning multiple training objectives in a single model is challenging due to the unknown relative significance as well as the potential contrariety between them. Empirical studies have shown that the current objective sampling in an ad-hoc manual setting makes the learned language representation barely converge to the desired optimum. Thus, we propose \\textit{MOMETAS}, a novel adaptive sampler based on meta-learning, which learns the latent sampling pattern on arbitrary pre-training objectives. Such a design is lightweight with negligible additional training overhead. To validate our approach, we adopt five objectives and conduct continual pre-training with BERT-base and BERT-large models, where MOMETAS demonstrates universal performance gain over other rule-based sampling strategies on 14 natural language processing tasks.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [8.033607482910156, 5.738097667694092], "id": 4378, "title": "ConGen: Unsupervised Control and Generalization Distillation For Sentence Representation", "authors": "Peerat Limkonchotiwat, Wuttikorn Ponwitayarat, Lalita Lowphansirikul, Ekapol Chuangsuwanich and Sarana Nutanong", "abstract": "Sentence representations are essential in many NLP tasks operating at the sentence level.\nRecently, research attention has shifted towards learning how to represent sentences without any annotations, i.e., unsupervised representation learning. \nDespite the benefit of training without supervised data, there is still a performance penalty compared to supervised methods.\nFurthermore, the supervised-unsupervised performance gap widens as we reduce the model size. \nIn this paper, we propose an unsupervised sentence representation method to reduce the supervised-unsupervised performance gap, especially for smaller models. \nUtilizing the concept for knowledge distillation, we derive a distillation framework comprising two training objectives, control and generalize, called ConGen. \nExperiments on semantic textual similarity (STS), text classification (transfer), and natural language inference (NLI) tasks show that ConGen is on par with supervised training even on smaller models.\nFurthermore, our method consistently outperformed competitors on multilingual STS.\nThe code and models are available at https://github.com/KornWtp/ConGen.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [8.046293258666992, 3.079493522644043], "id": 4379, "title": "Large-Scale Differentially Private BERT", "authors": "Rohan Anil, Badih Ghazi, Vineet Gupta, Ravi Kumar and Pasin Manurangsi", "abstract": "In this work, we study the large-scale pretraining of BERT-Large (Devlin et al., 2019) with differentially private SGD (DP-SGD). We show that combined with a careful implementation, scaling up the batch size to millions (i.e., mega-batches) improves the utility of the DP-SGD step for BERT; we also enhance the training efficiency by using an increasing batch size schedule. Our implementation builds on the recent work of Subramani et al (2020), who demonstrated that the overhead of a DP-SGD step is minimized with effective use of JAX (Bradbury et al., 2018; Frostig et al., 2018) primitives in conjunction with the XLA compiler (XLA team and collaborators, 2017). Our implementation achieves a masked language model accuracy of 60.5% at a batch size of 2M, for epsilon=5, which is a reasonable privacy setting. To put this number in perspective, non-private BERT models achieve an accuracy of \u223c70%.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [10.095953941345215, 7.3653764724731445], "id": 4381, "title": "Improving Zero-Shot Multilingual Translation with Universal Representations and Cross-Mapping", "authors": "Shuhao Gu and Yang Feng", "abstract": "The many-to-many multilingual neural machine translation can translate between language pairs unseen during training, i.e., zero-shot translation. Improving zero-shot translation requires the model to learn universal representations and cross-mapping relationships to transfer the knowledge learned on the supervised directions to the zero-shot directions. In this work, we propose the state mover's distance based on the optimal theory to model the difference of the representations output by the encoder. Then, we bridge the gap between the semantic-equivalent representations of different languages at the token level by minimizing the proposed distance to learn universal representations. Besides, we propose an agreement-based training scheme, which can help the model make consistent predictions based on the semantic-equivalent sentences to learn universal cross-mapping relationships for all translation directions. The experimental results on diverse multilingual datasets show that our method can improve consistently compared with the baseline system and other contrast methods. The analysis proves that our method can better align the semantic space and improve the prediction consistency.", "track": "Machine Translation", "label": 10}, {"loc": [7.4768500328063965, 3.896463394165039], "id": 4383, "title": "Controllable Fake Document Infilling for Cyber Deception", "authors": "Yibo Hu, Yu Lin, Erick Skorupa Parolin, Latifur Khan and Kevin Hamlen", "abstract": "Recent works in cyber deception study how to deter malicious intrusion by generating multiple fake versions of a critical document to impose costs on adversaries who need to identify the correct information. However, existing approaches are context-agnostic, resulting in sub-optimal and unvaried outputs. We propose a novel context-aware model, Fake Document Infilling (FDI), by converting the problem to a controllable mask-then-infill procedure. FDI masks important concepts of varied lengths in the document, then infills a realistic but fake alternative considering both the previous and future contexts. We conduct comprehensive evaluations on technical documents and news stories. Results show that FDI outperforms the baselines in generating highly believable fakes with moderate modification to protect critical information and deceive adversaries.", "track": "Natural Language Generation", "label": 6}, {"loc": [8.371031761169434, 6.50191068649292], "id": 4388, "title": "Weakly Supervised Headline Dependency Parsing", "authors": "Adrian Benton, Tianze Shi, Ozan \u0130rsoy and Igor Malioutov", "abstract": "English news headlines form a register with unique syntactic properties that have been documented in linguistics literature since the 1930s. However, headlines have received surprisingly little attention from the NLP syntactic parsing community. We aim to bridge this gap by providing the first news headline corpus of Universal Dependencies annotated syntactic dependency trees, which enables us to evaluate existing state-of-the-art dependency parsers on news headlines. To improve English news headline parsing accuracies, we develop a projection method to bootstrap silver training data from unlabeled news headline-article lead sentence pairs. Models trained on silver headline parses demonstrate significant improvements in performance over models trained solely on gold-annotated long-form texts. Ultimately, we find that, although projected silver training data improves parser performance across different news outlets, the improvement is moderated by constructions idiosyncratic to outlet.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [3.868257522583008, 9.301432609558105], "id": 4389, "title": "BOOKSUM: A Collection of Datasets for Long-form Narrative Summarization", "authors": "Wojciech Kryscinski, Nazneen Rajani, Divyansh Agarwal, Caiming Xiong and Dragomir Radev", "abstract": "The majority of existing text summarization datasets include short-form source documents that lack long-range causal and temporal dependencies, and often contain strong layout and stylistic biases. While relevant, such datasets will offer limited challenges for future text summarization systems. We address these issues by introducing BOOKSUM, a collection of datasets for long-form narrative summarization. Our dataset covers documents from the literature domain, such as novels, plays and stories, and includes highly abstractive, human written summaries on three levels of granularity of increasing difficulty: paragraph-, chapter-, and book-level. The domain and structure of our dataset poses a unique set of challenges for summarization systems, which include: processing very long documents, non-trivial causal and temporal dependencies, and rich discourse structures. To facilitate future work, we trained and evaluated multiple extractive and abstractive summarization models as baselines for our dataset.", "track": "Resources and Evaluation", "label": 1}, {"loc": [5.56272029876709, 8.466367721557617], "id": 4391, "title": "Not All Errors are Equal: Learning Text Generation Metrics using Stratified Error Synthesis", "authors": "Wenda Xu, Yi-Lin Tuan, Yujie Lu, Michael S. Saxon, Lei Li and William Yang Wang", "abstract": "Is it possible to build a general and automatic natural language generation (NLG) evaluation metric? Existing learned metrics either perform unsatisfactorily or are restricted to tasks where large human rating data is already available. We introduce SESCORE, a model-based metric that is highly correlated with human judgements without requiring human annotation, by utilizing a novel, iterative error synthesis and severity scoring pipeline. This pipeline applies a series of plausible errors to raw text and assigns severity labels by simulating human judgements with entailment. We evaluate SESCORE against existing metrics by comparing how their scores correlate with human ratings. SESCORE outperforms all prior unsupervised metrics on multiple diverse NLG tasks including machine translation, image captioning, and WebNLG text generation. For WMT 20/21\nEn-De and Zh-En, SESCORE improve the average Kendall correlation with human judgement from 0.154 to 0.195. SESCORE even achieves comparable performance to the best supervised metric COMET, despite receiving no human annotated training data.", "track": "Resources and Evaluation", "label": 1}, {"loc": [0.8610759973526001, 8.150385856628418], "id": 4395, "title": "Summarization as Indirect Supervision for Relation Extraction", "authors": "Keming Lu, I-Hung Hsu, Wenxuan Zhou, Mingyu Derek Ma and Muhao Chen", "abstract": "Relation extraction (RE) models have been challenged by their reliance on training data with expensive annotations. Considering that summarization tasks aim at acquiring concise expressions of synoptical information from the longer context, these tasks naturally align with the objective of RE, i.e., extracting a kind of synoptical information that describes the relation of entity mentions. We present SuRE, which converts RE into a summarization formulation. SuRE leads to more precise and resource-efficient RE based on indirect supervision from summarization tasks. To achieve this goal, we develop sentence and relation conversion techniques that essentially bridge the formulation of summarization and RE tasks. We also incorporate constraint decoding techniques with Trie scoring to further enhance summarization-based RE with robust inference. Experiments on three RE datasets demonstrate the effectiveness of SuRE in both full-dataset and low-resource settings, showing that summarization is a promising source of indirect supervision signals to improve RE models.", "track": "Information Extraction", "label": 5}, {"loc": [2.942837953567505, 6.075201034545898], "id": 4400, "title": "DIGAT: Modeling News Recommendation with Dual-Graph Interaction", "authors": "Zhiming Mao, Jian Li, Hongru Wang, Xingshan Zeng and Kam-Fai Wong", "abstract": "News recommendation (NR) is essential for online news services. Existing NR methods typically adopt a news-user representation learning framework, facing two potential limitations. First, in news encoder, single candidate news encoding suffers from an insufficient semantic information problem. Second, existing graph-based NR methods are promising but lack effective news-user feature interaction, rendering the graph-based recommendation suboptimal. To overcome these limitations, we propose dual-interactive graph attention networks (DIGAT) consisting of news- and user-graph channels. In the news-graph channel, we enrich the semantics of single candidate news by incorporating the semantically relevant news information with a semantic-augmented graph (SAG). In the user-graph channel, multi-level user interests are represented with a news-topic graph. Most notably, we design a dual-graph interaction process to perform effective feature interaction between the news and user graphs, which facilitates accurate news-user representation matching. Experiment results on the benchmark dataset MIND show that DIGAT outperforms existing news recommendation methods. Further ablation studies and analyses validate the effectiveness of (1) semantic-augmented news graph modeling and (2) dual-graph interaction.", "track": "NLP Applications", "label": 0}, {"loc": [8.007988929748535, 9.231302261352539], "id": 4406, "title": "SMASH: Improving SMAll Language Models' Few-SHot Ability with Prompt-Based Distillation", "authors": "Yueqian Wang, Chang Liu, Kai Chen, Xi Wang and Dongyan Zhao", "abstract": "Large-scale language models coupled with prompts have shown remarkable performance on few-shot learning. However, through systematic experiments, we find that the few-shot performance of small language models is poor, and using prompts on them brings fewer improvements than on larger ones. In this paper, we propose SMASH, an approach to improve SMAll language models' few-SHot ability by training on intermediate tasks before prompt-based fine-tuning on downstream tasks. We design intermediate tasks for sentence-pair tasks and sentiment classification tasks by creating training examples with prompt templates similar to downstream tasks using sentences sampled from a large-scale unsupervised corpus, and apply knowledge distillation to distill from outputs of larger pre-trained models as the training objective. We conduct extensive experiments and show that SMASH can make a 6-layer DistilRoBRETa-base achieve comparable performance on few-shot datasets with a 12-layer RoBERTa-base at a low cost.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [2.999054193496704, 4.688065052032471], "id": 4410, "title": "Consecutive Question Generation via Dynamic Multitask Learning", "authors": "Yunji Li, Sujian Li and Xing Shi", "abstract": "In this paper, we propose the task of consecutive question generation (CQG), which generates a set of logically related question-answer pairs to understand a whole passage, with a comprehensive consideration of the aspects including accuracy, coverage, and informativeness.\nTo achieve this, we first examine the four key elements of CQG, i.e., question, answer, rationale, and context history, and propose a novel dynamic multitask framework with one main task generating a question-answer pair, and four auxiliary tasks generating other elements. It directly helps the model generate good questions through both joint training and self-reranking. At the same time, to fully explore the worth-asking information in a given passage, we make use of the reranking losses to sample the rationales and search for the best question series globally.\nFinally, we measure our strategy by QA data augmentation and manual evaluation, as well as a novel application of generated question-answer pairs on DocNLI. We prove that our strategy can improve question generation significantly and benefit multiple related NLP tasks.", "track": "Natural Language Generation", "label": 6}, {"loc": [8.697159767150879, 6.511637210845947], "id": 4411, "title": "Subword Segmental Language Modelling for Nguni Languages", "authors": "Francois Meyer and Jan Buys", "abstract": "Subwords have become the standard units of text in NLP, enabling efficient open-vocabulary models. With algorithms like byte-pair encoding (BPE), subword segmentation is viewed as a preprocessing step applied to the corpus before training. This can lead to sub-optimal segmentations for low-resource languages with complex morphologies. We propose a subword segmental language model (SSLM) that learns how to segment words while being trained for autoregressive language modelling. By unifying subword segmentation and language modelling, our model learns subwords that optimise LM performance. We train our model on the 4 Nguni languages of South Africa. These are low-resource agglutinative languages, so subword information is critical. As an LM, SSLM outperforms existing approaches such as BPE-based models on average across the 4 languages. Furthermore, it outperforms standard subword segmenters on unsupervised morphological segmentation. We also train our model as a word-level sequence model, resulting in an unsupervised morphological segmenter that outperforms existing methods by a large margin for all 4 languages. Our results show that learning subword segmentation is an effective alternative to existing subword segmenters, enabling the model to discover morpheme-like subwords that improve its LM capabilities.", "track": "Phonology, Morphology and Word Segmentation", "label": 25}, {"loc": [5.415914535522461, 12.141900062561035], "id": 4417, "title": "Towards Robust Visual Question Answering: Making the Most of Biased Samples via Contrastive Learning", "authors": "Qingyi Si, Yuanxin LIU, Fandong Meng, Zheng Lin, Peng Fu, Yanan Cao, Weiping Wang and Jie Zhou", "abstract": "Models for Visual Question Answering (VQA) often rely on the spurious correlations, i.e., the language priors, that appear in the biased samples of training set, which make them brittle against the out-of-distribution (OOD) test data. Recent methods have achieved promising progress in overcoming this problem by reducing the impact of biased samples on model training. However, these models reveal a trade-off that the improvements on OOD data severely sacrifice the performance on the in-distribution (ID) data (which is dominated by the biased samples). Therefore, we propose a novel contrastive learning approach, MMBS, for building robust VQA models by Making the Most of Biased Samples. Specifically, we construct positive samples for contrastive learning by eliminating the information related to spurious correlation from the original training samples and explore several strategies to use the constructed positive samples for training. Instead of undermining the importance of biased samples in model training, our approach precisely exploits the biased samples for unbiased information that contributes to reasoning. The proposed method is compatible with various VQA backbones. We validate our contributions by achieving competitive performance on the OOD dataset VQA-CP v2 while preserving robust performance on the ID dataset VQA v2.", "track": "Question Answering", "label": 11}, {"loc": [5.2876434326171875, 8.902695655822754], "id": 4426, "title": "P3LM: Probabilistically Permuted Prophet Language Modeling for Generative Pre-Training", "authors": "Junwei Bao, Yifan Wang, ying jiangyong, Yeyun Gong, Jing Zhao, Youzheng Wu and Xiaodong He", "abstract": "Conventional autoregressive left-to-right (L2R) sequence generation faces two issues during decoding: limited to unidirectional target sequence modeling, and constrained on strong local dependencies.\nTo address the aforementioned problem, we propose P3LM, a probabilistically permuted prophet language model, which strengthens the modeling of bidirectional information and long token dependencies for sequence generation.\nSpecifically, P3LM learns to generate tokens in permuted order upon an order-aware transformer decoder, as well as to generate the corresponding future N tokens with a multi-stream attention mechanism.\nExtensive experiments are conducted on the GLGE benchmark, which includes four datasets for summarization, two for question generation, one for conversational question answering, and one for dialog response generation, where P3LM achieves state-of-the-art results compared with strong publicly available generative pre-training methods.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.482527256011963, 7.5004425048828125], "id": 4429, "title": "Holistic Sentence Embeddings for Better Out-of-Distribution Detection", "authors": "Sishuo Chen, Xiaohan Bi, Rundong Gao and Xu Sun", "abstract": "Detecting out-of-distribution (OOD) instances is significant for the safe deployment of NLP models. Among recent textual OOD detection works based on pretrained language models (PLMs), distance-based methods have shown superior performance. However, they estimate sample distance scores in the last-layer CLS embedding space and thus do not make full use of linguistic information underlying in PLMs. To address the issue, we propose to boost OOD detection by deriving more holistic sentence embeddings. On the basis of the observations that token averaging and layer combination contribute to improving OOD detection, we propose a simple embedding approach named Avg-Avg, which averages all token representations from each intermediate layer as the sentence embedding and significantly surpasses the state-of-the-art on a comprehensive suite of benchmarks by a 9.33% FAR95 margin. Furthermore, our analysis demonstrates that it indeed helps preserve general linguistic knowledge in fine-tuned PLMs and substantially benefits detecting background shifts. The simple yet effective embedding method can be applied to fine-tuned PLMs with negligible extra costs, providing a free gain in OOD detection. Our code is available at https://github.com/lancopku/Avg-Avg.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [1.770755410194397, 5.299342155456543], "id": 4432, "title": "MuGER2: Multi-Granularity Evidence Retrieval and Reasoning for Hybrid Question Answering", "authors": "Yingyao Wang, Junwei Bao, Chaoqun Duan, Youzheng Wu, Xiaodong He and Tiejun Zhao", "abstract": "Hybrid question answering (HQA) aims to answer questions over heterogeneous data, including tables and passages linked to table cells. The heterogeneous data can provide different granularity evidence to HQA models, e.t., column, row, cell, and link. Conventional HQA models usually retrieve coarse- or fine-grained evidence to reason the answer. Through comparison, we find that coarse-grained evidence is easier to retrieve but contributes less to the reasoner, while fine-grained evidence is the opposite. To preserve the advantage and eliminate the disadvantage of different granularity evidence, we propose MuGER2, a Multi-Granularity Evidence Retrieval and Reasoning approach. In evidence retrieval, a unified retriever is designed to learn the multi-granularity evidence from the heterogeneous data. In answer reasoning, an evidence selector is proposed to navigate the fine-grained evidence for the answer reader based on the learned multi-granularity evidence. Experiment results on the HybridQA dataset show that MuGER2 significantly boosts the HQA performance. Further ablation analysis verifies the effectiveness of both the retrieval and reasoning designs.", "track": "Question Answering", "label": 11}, {"loc": [9.560885429382324, 6.354761123657227], "id": 4433, "title": "EntityCS: Improving Zero-Shot Cross-lingual Transfer with Entity-Centric Code Switching", "authors": "Chenxi Whitehouse, Fenia Christopoulou and Ignacio Iacobacci", "abstract": "Accurate alignment between languages is fundamental for improving cross-lingual pre-trained language models (XLMs). Motivated by the natural phenomenon of code-switching (CS) in multilingual speakers, CS has been used as an effective data augmentation method that offers language alignment at word- or phrase-level, in contrast to sentence-level via parallel instances. Existing approaches either use dictionaries or parallel sentences with word-alignment to generate CS data by randomly switching words in a sentence. However, such methods can be suboptimal as dictionaries disregard semantics, and syntax might become invalid after random word switching. In this work, we propose EntityCS, a method that focuses on Entity-level Code-Switching to capture fine-grained cross-lingual semantics without corrupting syntax. We use Wikidata and the English Wikipedia to construct an entity-centric CS corpus by switching entities to their counterparts in other languages. We further propose entity-oriented masking strategies during intermediate model training on the EntityCS corpus for improving entity prediction. Evaluation of the trained models on four entity-centric downstream tasks shows consistent improvements over the baseline with a notable increase of 10% in Fact Retrieval. We release the corpus and models to assist research on code-switching and enriching XLMs with external knowledge.", "track": "Multilinguality", "label": 13}, {"loc": [4.737878799438477, 8.774112701416016], "id": 4444, "title": "MBTI Personality Prediction for Fictional Characters Using Movie Scripts", "authors": "Yisi Sang, Xiangyang Mou, Mo Yu, Dakuo Wang, Jing Li and Jeffrey Stanton", "abstract": "An NLP model that understands stories should be able to understand the characters in them. To support the development of neural models for this purpose, we construct a benchmark, Story2Personality. The task is to predict a movie character's MBTI or Big 5 personality types based on the narratives of the character. Experiments show that our task is challenging for the existing text classification models, as none is able to largely outperform random guesses. We further proposed a multi-view model for personality prediction using both verbal and non-verbal descriptions, which gives improvement compared to using only verbal descriptions. The uniqueness and challenges in our dataset call for the development of narrative comprehension techniques from the perspective of understanding characters.", "track": "Resources and Evaluation", "label": 1}, {"loc": [3.7458555698394775, 5.892829895019531], "id": 4451, "title": "A Simple and Strong Baseline for End-to-End Neural RST-style Discourse Parsing", "authors": "Naoki Kobayashi, Tsutomu Hirao, Hidetaka Kamigaito, Manabu Okumura and Masaaki Nagata", "abstract": "To promote and further develop RST-style discourse parsing models, we need a strong baseline that can be regarded as a reference for reporting reliable experimental results. \nThis paper explores a strong baseline by integrating existing simple parsing strategies, top-down and bottom-up, with various transformer-based pre-trained language models.\nThe experimental results obtained from two benchmark datasets demonstrate that the parsing performance strongly relies on the pre-trained language models rather than the parsing strategies.\nIn particular, the bottom-up parser achieves large performance gains compared to the current best parser when employing DeBERTa.\nWe further reveal that language models with a span-masking scheme especially boost the parsing performance through our analysis within intra- and multi-sentential parsing, and nuclearity prediction.", "track": "Discourse and Pragmatics", "label": 24}, {"loc": [6.982294082641602, 6.36100959777832], "id": 4462, "title": "Probing for Constituency Structure in Neural Language Models", "authors": "David Arps, Younes Samih, Laura Kallmeyer and Hassan Sajjad", "abstract": "In this paper, we investigate to which extent contextual neural language models (LMs) implicitly learn syntactic structure. More concretely, we focus on constituent structure as represented in the Penn Treebank (PTB). Using standard probing techniques based on diagnostic classifiers, we assess the accuracy of representing constituents of different categories within the neuron activations of a LM such as RoBERTa. In order to make sure that our probe focuses on syntactic knowledge and not on implicit semantic generalizations, we also experiment on a PTB version that is obtained by randomly replacing constituents with each other while keeping syntactic structure, i.e., a semantically ill-formed but syntactically well-formed version of the PTB. We find that 4 pretrained transfomer LMs obtain high performance on our probing tasks even on manipulated data, suggesting that semantic and syntactic knowledge in their representations can be separated and that constituency information is in fact learned by the LM. Moreover, we show that a complete constituency tree can be linearly separated from LM representations.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [4.2999982833862305, 4.9409074783325195], "id": 4467, "title": "Table-To-Text generation and pre-training with TabT5", "authors": "Ewa Andrejczuk, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene and Yasemin Altun", "abstract": "Encoder-only transformer models have been successfully applied to different table understanding tasks, as in TAPAS. A major limitation of these architectures is that they are constrained to classification-like tasks such as cell selection or entailment detection. We present TabT5, an encoder-decoder model that generates natural language text based on tables and textual inputs. TabT5 overcomes the encoder-only limitation by incorporating a decoder component and leverages the input structure with table specific embeddings and pre-training. TabT5 achieves new state-of-the-art results on several domains, including spreadsheet formula prediction with a 15% increase in sequence accuracy, QA with a 2.5% increase in sequence accuracy and data-to-text generation with a 2.5% increase in BLEU.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [5.584383010864258, 9.727012634277344], "id": 4470, "title": "A POMDP Dialogue Policy with 3-way Grounding and Adaptive Sensing for Learning through Communication", "authors": "Maryam Zare, Alan R. Wagner and Rebecca Jane Passonneau", "abstract": "Agents to assist with rescue, surgery, and similar activities could collaborate better with humans if they could learn new strategic behaviors through communication. We introduce a novel POMDP dialogue policy for learning from people. The policy has 3-way grounding of language in the shared physical context, the dialogue context, and persistent knowledge. It can learn distinct but related games, and can continue learning across dialogues for complex games. A novel sensing component supports adaptation to information-sharing differences across people. The single policy performs better than oracle policies customized to specific games and information behavior.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [4.900187969207764, 3.453322172164917], "id": 4471, "title": "PaCo: Preconditions Attributed to Commonsense Knowledge", "authors": "Ehsan Qasemi, Filip Ilievski, Muhao Chen and Pedro Szekely", "abstract": "Humans can seamlessly reason with circumstantial preconditions of commonsense knowledge. We understand that a glass is used for drinking water, unless the glass is broken or the water is toxic. Despite state-of-the-art (SOTA) language models' (LMs) impressive performance on inferring commonsense knowledge, it is unclear whether they understand the circumstantial preconditions. To address this gap, we propose a novel challenge of reasoning with circumstantial preconditions. We collect a dataset, called PaCo, consisting of 12.4 thousand preconditions of commonsense statements expressed in natural language. Based on this dataset, we create three canonical evaluation tasks and use them to examine the capability of existing LMs to understand situational preconditions. Our results reveal a 10-30% gap between machine and human performance on our tasks, which shows that reasoning with preconditions is an open challenge.", "track": "Resources and Evaluation", "label": 1}, {"loc": [1.9442188739776611, 8.694657325744629], "id": 4477, "title": "Improving Few-Shot Domain Transfer for Named Entity Disambiguation with Pattern Exploitation", "authors": "Philip Blair and Kfir Bar", "abstract": "Named entity disambiguation (NED) is a critical subtask of entity linking, which seeks to connect knowledge base entities with textual mentions of those entities. Naturally, the performance of a model depends on the domain it was trained on; thus, reducing the amount of data required to train models is advantageous. In this work, we leverage recent research on pattern exploitation for NED and explore whether it can reduce the amount of data required for domain adaptation by reformulating the disambiguation task as a masked language modeling problem. Using ADAPET (Tam et al., 2021), which implements a new approach for few-shot learning using fine-tuned transformer-based language models, we produce an NED model which yields, without any sacrifice of in-domain accuracy, a 7% improvement in zero-shot cross-domain performance as evaluated on NEDMed, a new NED dataset of mental health news which we release with this work.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [4.964476585388184, 5.926020622253418], "id": 4479, "title": "Capturing Topic Framing via Masked Language Modeling", "authors": "Xiaobo Guo, Weicheng Ma and Soroush Vosoughi", "abstract": "Differential framing of issues can lead to divergent world views on important issues. This is especially true in domains where the information presented can reach a large audience, such as traditional and social media. Scalable and reliable measurement of such differential framing is an important first step in addressing them. In this work, based on the intuition that framing affects the tone and word choices in written language, we propose a framework for modeling the differential framing of issues through masked token prediction via large-scale fine-tuned language models (LMs). Specifically, we explore three key factors for our framework: 1) prompt generation methods for the masked token prediction; 2) methods for normalizing the output of fine-tuned LMs; 3) robustness to the choice of pre-trained LMs used for fine-tuning. Through experiments on a dataset of articles from traditional media outlets covering five diverse and politically polarized topics, we show that our framework can capture differential framing of these topics with high reliability.", "track": "NLP Applications", "label": 0}, {"loc": [4.667919158935547, 5.000764846801758], "id": 4480, "title": "WANLI: Worker and AI Collaboration for Natural Language Inference Dataset Creation", "authors": "Alisa Liu, Swabha Swayamdipta, Noah A. Smith and Yejin Choi", "abstract": "A recurring challenge of crowdsourcing NLP datasets at scale is that human writers often rely on repetitive patterns when crafting examples, leading to a lack of linguistic diversity. We introduce a novel approach for dataset creation based on worker and AI collaboration, which brings together the generative strength of language models and the evaluative strength of humans. Starting with an existing dataset, MultiNLI for natural language inference (NLI), our approach uses dataset cartography to automatically identify examples that demonstrate challenging reasoning patterns, and instructs GPT-3 to compose new examples with similar patterns. Machine generated examples are then automatically filtered, and finally revised and labeled by human crowdworkers. The resulting dataset, WANLI, consists of 107,885 NLI examples and presents unique empirical strengths over existing NLI datasets. Remarkably, training a model on WANLI improves performance on eight out-of-domain test sets we consider, including by 11% on HANS and 9% on Adversarial NLI, compared to training on the 4x larger MultiNLI. Moreover, it continues to be more effective than MultiNLI augmented with other NLI datasets. Our results demonstrate the promise of leveraging natural language generation techniques and re-imagining the role of humans in the dataset creation process.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [5.528929233551025, 8.837152481079102], "id": 4483, "title": "Sequentially Controlled Text Generation", "authors": "Alexander Spangher, Yao Ming, Xinyu Hua and Nanyun Peng", "abstract": "While GPT-2 generates sentences that are remarkably human-like, longer documents can ramble and do not follow human-like writing structure. We study the problem of imposing structure on long-range text. We propose a novel controlled text generation task, sequentially controlled text generation, and identify a dataset, NewsDiscourse as a starting point for this task. We develop a sequential controlled text generation pipeline with generation and editing. We test different degrees of structural awareness and show that, in general, more structural awareness results in higher control- accuracy, grammaticality, coherency and topicality, approaching human-level writing performance.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.625174522399902, 9.655472755432129], "id": 4486, "title": "Revisiting the Roles of \"Text\u201d in Text Games", "authors": "Yi Gu, Shunyu Yao, Chuang Gan, Josh Tenenbaum and Mo Yu", "abstract": "Text games present opportunities for natural language understanding (NLU) methods to tackle reinforcement learning (RL) challenges. However, recent work has questioned the necessity of NLU by showing random text hashes could perform decently. In this paper, we pursue a fine-grained investigation into the roles of text in the face of different RL challenges, and reconcile that semantic and non-semantic language representations could be complementary rather than contrasting. Concretely, we propose a simple scheme to extract relevant contextual information into an approximate state hash as extra input for an RNN-based text agent. Such a lightweight plug-in achieves competitive performance with state-of-the-art text agents using advanced NLU techniques such as knowledge graph and passage retrieval, suggesting non-NLU methods might suffice to tackle the challenge of partial observability. However, if we remove RNN encoders and use approximate or even ground-truth state hash alone, the model performs miserably, which confirms the importance of semantic function approximation to tackle the challenge of combinatorially large observation and action spaces. Our findings and analysis provide new insights for designing better text game task setups and agents.", "track": "Resources and Evaluation", "label": 1}, {"loc": [8.563857078552246, 8.47903060913086], "id": 4487, "title": "FPT: Improving Prompt Tuning Efficiency via Progressive Training", "authors": "Yufei Huang, Yujia Qin, Huadong Wang, Yichun Yin, Maosong Sun, Zhiyuan Liu and Qun Liu", "abstract": "Recently, prompt tuning (PT) has gained increasing attention as a parameter-efficient way of tuning pre-trained language models (PLMs). Despite extensively reducing the number of tunable parameters and achieving satisfying performance, PT is training-inefficient due to its slow convergence. To improve PT's training efficiency, we first make some novel observations about the prompt transferability of \"partial PLMs\u201d, which are defined by compressing a PLM in depth or width. We observe that the soft prompts learned by different partial PLMs of various sizes are similar in the parameter space, implying that these soft prompts could potentially be transferred among partial PLMs. Inspired by these observations, we propose Fast Prompt Tuning (FPT), which starts by conducting PT using a small-scale partial PLM, and then progressively expands its depth and width until the full-model size. After each expansion, we recycle the previously learned soft prompts as initialization for the enlarged partial PLM and then proceed PT. We demonstrate the feasibility of FPT on 5 tasks and show that FPT could save over 30% training computations while achieving comparable performance. The codes are publicly available at https://github.com/thunlp/FastPromptTuning.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [1.6058895587921143, 8.621681213378906], "id": 4490, "title": "Prompt-learning for Fine-grained Entity Typing", "authors": "Ning Ding, Yulin Chen, Xu Han, Guangwei Xu, Xiaobin Wang, Pengjun Xie, Haitao Zheng, Zhiyuan Liu, Juanzi Li and Hong-Gee Kim", "abstract": "As an effective approach to adapting pre-trained language models (PLMs) for specific tasks, prompt-learning has recently attracted much attention from researchers. By using cloze-style language prompts to stimulate the versatile knowledge of PLMs, prompt-learning can achieve promising results on a series of NLP tasks, such as natural language inference, sentiment classification, and knowledge probing. In this work, we investigate the application of prompt-learning on fine-grained entity typing in fully supervised, few-shot, and zero-shot scenarios. \nWe first develop a simple and effective prompt-learning pipeline by constructing entity-oriented verbalizers and templates and conducting masked language modeling. Further, to tackle the zero-shot regime, we propose a self-supervised strategy that carries out distribution-level optimization in prompt-learning to automatically summarize the information of entity types. Extensive experiments on four fine-grained entity typing benchmarks under fully supervised, few-shot, and zero-shot settings show the effectiveness of the prompt-learning paradigm and further make a powerful alternative to vanilla fine-tuning.", "track": "Information Extraction", "label": 5}, {"loc": [8.6024808883667, 6.765219688415527], "id": 4492, "title": "TransLIST: A Transformer-Based Linguistically Informed Sanskrit Tokenizer", "authors": "Jivnesh Sandhan, Rathin Singha, Narein Rao, SUVENDU SAMANTA, Laxmidhar Behera and Pawan Goyal", "abstract": "Sanskrit Word Segmentation (SWS) is essential in making digitized texts available and in deploying downstream tasks. It is, however, non-trivial because of the sandhi phenomenon that modifies the characters at the word boundaries, and needs special treatment. Existing lexicon driven approaches for SWS make use of Sanskrit Heritage Reader, a lexicon-driven shallow parser, to generate the complete candidate solution space, over which various methods are applied to produce the most valid solution. However, these approaches fail while encountering out-of-vocabulary tokens. On the other hand, purely engineering methods for SWS have made use of recent advances in deep learning, but cannot make use of the latent word information on availability. \n\nTo mitigate the shortcomings of both families of approaches, we propose Transformer based Linguistically Informed Sanskrit Tokenizer (TransLIST) consisting of (1) a module that encodes the character input along with latent-word information, which takes into account the sandhi phenomenon specific to SWS and is apt to work with partial or no candidate solutions, (2) a novel soft-masked attention to prioritize potential candidate words and (3) a novel path ranking algorithm to rectify the corrupted predictions. Experiments on the benchmark datasets for SWS show that TransLIST outperforms the current state-of-the-art system by an average 7.2 points absolute gain in terms of perfect match (PM) metric.", "track": "Phonology, Morphology and Word Segmentation", "label": 25}, {"loc": [8.12582015991211, 3.0134832859039307], "id": 4498, "title": "Fair NLP Models with Differentially Private Text Encoders", "authors": "Gaurav Maheshwari, Pascal Denis, Mikaela Keller and Aur\u00e9lien Bellet", "abstract": "Encoded text representations often capture sensitive attributes about individuals (e.g., race or gender), which raise privacy concerns and can make downstream models unfair to certain groups. In this work, we propose FEDERATE, an approach that combines ideas from differential privacy and adversarial training to learn private text representations which also induces fairer models. We empirically evaluate the trade-off between the privacy of the representations and the fairness and accuracy of the downstream model on four NLP datasets. Our results show that FEDERATE consistently improves upon previous methods, and thus suggest that privacy and fairness can positively reinforce each other.", "track": "Ethics", "label": 21}, {"loc": [10.621461868286133, 7.153485298156738], "id": 4500, "title": "Modeling Context With Linear Attention for Scalable Document-Level Translation", "authors": "Zhaofeng Wu, Hao Peng, Nikolaos Pappas and Noah A. Smith", "abstract": "Document-level machine translation leverages inter-sentence dependencies to produce more coherent and consistent translations. However, these models, predominantly based on transformers, are difficult to scale to long documents as their attention layers have quadratic complexity in the sequence length. Recent efforts on efficient attention improve scalability, but their effect on document translation remains unexplored. In this work, we investigate the efficacy of a recent linear attention model by Peng et al. (2021) on document translation and augment it with a sentential gate to promote a recency inductive bias. We evaluate the model on IWSLT 2015 and OpenSubtitles 2018 against the transformer, demonstrating substantially increased decoding speed on long sequences with similar or better BLEU scores. We show that sentential gating further improves translation quality on IWSLT.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [8.130899429321289, 7.1678385734558105], "id": 4503, "title": "What do Large Language Models Learn beyond Language?", "authors": "Avinash Madasu and Shashank Srivastava", "abstract": "Large language models (LMs) have rapidly become a mainstay in Natural Language Processing. These models are known to acquire rich linguistic knowledge from training on large amounts of text. In this paper, we investigate if pre-training on text also confers these models with helpful `inductive biases' for non-linguistic reasoning. On a set of 19 diverse non-linguistic tasks involving quantitative computations, recognizing regular expressions and reasoning over strings. We find that pretrained models significantly outperform comparable non-pretrained neural models. This remains true also in experiments with training non-pretrained models with fewer parameters to account for model regularization effects. We further explore the effect of text domain on LMs by pretraining models from text from different domains and provenances. Our experiments surprisingly reveal that the positive effects of pre-training persist even when pretraining on multi-lingual text or computer code, and even for text generated from synthetic languages. Our findings suggest a hithertho unexplored deep connection between pre-training and inductive learning abilities of language models", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [3.2368741035461426, 4.806027889251709], "id": 4507, "title": "CONSISTENT: Open-Ended Question Generation From News Articles", "authors": "Tuhin Chakrabarty, Justin Lewis and Smaranda Muresan", "abstract": "Recent work on question generation has largely focused on factoid questions such as who, what,\nwhere, when about basic facts. Generating open-ended why, how, what, etc. questions that\nrequire long-form answers have proven more difficult. To facilitate the generation of openended questions, we propose CONSISTENT, a new end-to-end system for generating openended questions that are answerable from and faithful to the input text. Using news articles as\na trustworthy foundation for experimentation, we demonstrate our model's strength over several baselines using both automatic and human based evaluations. We contribute an evaluation\ndataset of expert-generated open-ended questions. We discuss potential downstream applications for news media organizations.", "track": "NLP Applications", "label": 0}, {"loc": [5.895401954650879, 8.80286693572998], "id": 4511, "title": "Efficient (Soft) Q-Learning for Text Generation with Limited Good Data", "authors": "Han Guo, Bowen Tan, Zhengzhong Liu, Eric Xing and Zhiting Hu", "abstract": "Maximum likelihood estimation (MLE) is the predominant algorithm for training text generation models. This paradigm relies on direct supervision examples, which is not applicable to many emerging applications, such as generating adversarial attacks or generating prompts to control language models. Reinforcement learning (RL) on the other hand offers a more flexible solution by allowing users to plug in arbitrary task metrics as reward. Yet previous RL algorithms for text generation, such as policy gradient (on-policy RL) and Q-learning (off-policy RL), are often notoriously inefficient or unstable to train due to the large sequence space and the sparse reward received only at the end of sequences. In this paper, we introduce a new RL formulation for text generation from the soft Q-learning (SQL) perspective. It enables us to draw from the latest RL advances, such as path consistency learning, to combine the best of on-/off-policy updates, and learn effectively from sparse reward. We apply the approach to a wide range of novel text generation tasks, including learning from noisy/negative examples, adversarial attacks, and prompt generation. Experiments show our approach consistently outperforms both task-specialized algorithms and the previous RL methods.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [5.768237590789795, 11.879371643066406], "id": 4516, "title": "Lexi: Self-Supervised Learning of the UI Language", "authors": "Pratyay Banerjee, Shweti Mahajan, Kushal Arora, Chitta Baral and Oriana Riva", "abstract": "Humans can learn to operate the user interface (UI) of an application by reading an instruction manual or how-to guide. Along with text, these resources include visual content such as UI screenshots and images of application icons referenced in the text. We explore how to leverage this data to learn generic visio-linguistic representations of UI screens and their components. These representations are useful in many real applications, such as accessibility, voice navigation, and task automation. Prior UI representation models rely on UI metadata (UI trees and accessibility labels), which is often missing, incompletely defined, or not accessible. We avoid such a dependency, and propose Lexi, a pre-trained vision and language model designed to handle the unique features of UI screens, including their text richness and context sensitivity. To train Lexi we curate the UICaption dataset consisting of 114k UI images paired with descriptions of their functionality. We evaluate Lexi on four tasks: UI action entailment, instruction-based UI image retrieval, grounding referring expressions, and UI entity recognition.", "track": "NLP Applications", "label": 0}, {"loc": [4.617684841156006, 9.1382417678833], "id": 4517, "title": "Inferring the Reader: Guiding Automated Story Generation with Commonsense Reasoning", "authors": "Xiangyu Peng, Siyan Li, Sarah Wiegreffe and Mark Riedl", "abstract": "Transformer-based language model approaches to automated story generation currently provide state-of-the-art results. However, they still suffer from plot incoherence when generating\nnarratives over time, and critically lack basic\ncommonsense reasoning. Furthermore, existing methods generally focus only on single-character stories, or fail to track characters\nat all. To improve the coherence of generated narratives and to expand the scope of\ncharacter-centric narrative generation, we introduce Commonsense-inference Augmented\nneural StoryTelling (CAST), a framework for\nintroducing commonsense reasoning into the\ngeneration process with the option to model the\ninteraction between multiple characters. We\nfind that our CAST method produces significantly more coherent, on-topic, enjoyable and\nfluent stories than existing models in both the\nsingle-character and two-character settings in\nthree storytelling domains.", "track": "Natural Language Generation", "label": 6}, {"loc": [4.403098106384277, 7.531744480133057], "id": 4520, "title": "How to Stop an Avalanche? JoDeM: Joint Decision Making through Compare and Contrast for Dialog State Tracking", "authors": "Haoming Wang and Wang Xin", "abstract": "Dialog state tracking (DST) is a core component in task-oriented dialog systems. Existing state-of-the-art DST model incorporates insight and intuition from the human experience into design of supplementary labels, which greatly assisted the training process of turn-by-turn DST model. Though the turn-by-turn scheme and supplementary labels enabled satisfactory performance on the task, most of the DST models of this fashion label or process the raw dialogue data on the premise that the last turn dialogue state is always correct, which is usually not the case. In this paper, we address the negative impact resulted from the premise above as the avalanche phenomenon. After that, we propose JoDeM, a state-of-the-art DST model which can tackle the Avalanche phenomenon with two mechanisms. First mechanism is a jointly decision making method to extract key information from the dialogue. Second mechanism is a compare and contrast dialogue update technique to prevent error accumulation. Example study and graph analysis are presented to support our claim about the harmfulness of avalanche phenomenon. We also conduct quantitative and qualitative experiments on the high quality MultiWOZ2.3 corpus dataset to demonstrate that the proposed model not only outperforms the existing state-of-the-art methods, but also proves the validity of solving avalanche degradation problem.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [8.023041725158691, 5.7381181716918945], "id": 4522, "title": "Contrastive Learning with Prompt-derived Virtual Semantic Prototypes for Unsupervised Sentence Embedding", "authors": "Jiali Zeng, Yongjing Yin, Yufan Jiang, Shuangzhi Wu and Yunbo Cao", "abstract": "Contrastive learning has become a new paradigm for unsupervised sentence embeddings.\nPrevious studies focus on instance-wise contrastive learning, attempting to construct positive pairs with textual data augmentation. In this paper, we propose a novel Contrastive learning method with Prompt-derived Virtual semantic Prototypes (ConPVP). \nSpecifically, with the help of prompts, we construct virtual semantic prototypes to each instance, and derive negative prototypes by using the negative form of the prompts.\nUsing a prototypical contrastive loss, we enforce the anchor sentence embedding to be close to its corresponding semantic prototypes, and far apart from the negative prototypes as well as the prototypes of other sentences.\nExtensive experimental results on semantic textual similarity, transfer, and clustering tasks demonstrate the effectiveness of our proposed model compared to strong baselines.\nCode is available at https://github.com/lemon0830/promptCSE.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [7.617175579071045, 3.6496224403381348], "id": 4524, "title": "Weight Perturbation as Defense against Adversarial Word Substitutions", "authors": "Jianhan Xu, Linyang Li, Jiping Zhang, Xiaoqing Zheng, Kai-Wei Chang, Cho-Jui Hsieh and Xuanjing Huang", "abstract": "The existence and pervasiveness of textual adversarial examples have raised serious concerns to security-critical applications. \nMany methods have been developed to defend against adversarial attacks for neural natural language processing (NLP) models.\nAdversarial training is one of the most successful defense methods by adding some random or intentional perturbations to the original input texts and making the models robust to the perturbed examples.\nIn this study, we explore the feasibility of improving the adversarial robustness of NLP models by performing perturbations in the parameter space rather than the input feature space.\nThe weight perturbation helps to find a better solution (i.e., the values of weights) that minimizes the adversarial loss among other feasible solutions.\nWe found that the weight perturbation can significantly improve the robustness of NLP models when it is combined with the perturbation in the input embedding space, yielding the highest accuracy on both clean and adversarial examples across different datasets.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [1.071527361869812, 10.52360725402832], "id": 4527, "title": "CORT: A New Baseline for Comparative Opinion Classification by Dual Prompts", "authors": "Yequan Wang, Hengran Zhang, Aixin Sun and Xuying Meng", "abstract": "Comparative opinion is a common linguistic phenomenon. The opinion is expressed by comparing multiple targets on a shared aspect, e.g., \"camera A is better than camera B in picture quality\u201d. Among the various subtasks in opinion mining, comparative opinion classification is relatively less studied. Current solutions use rules or classifiers to identify opinions, i.e., better, worse, or same, through feature engineering. Because the features are directly derived from the input sentence, these solutions are sensitive to the order of the targets mentioned in the sentence. For example, \"camera A is better than camera B\u201d means the same as \"camera B is worse than camera A\u201d; but the features of these two sentences are completely different. In this paper, we approach comparative opinion classification through prompt learning, taking the advantage of embedded knowledge in pre-trained language model. We design a twin framework with dual prompts, named CORT. This extremely simple model delivers state-of-the-art and robust performance on all benchmark datasets for comparative opinion classification. We believe CORT well serves as a new baseline for comparative opinion classification.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [7.5784406661987305, 12.384275436401367], "id": 4532, "title": "APEACH: Attacking Pejorative Expressions with Analysis on Crowd-Generated Hate Speech Evaluation Datasets", "authors": "Kichang Yang, Wonjun Jang and Won Ik Cho", "abstract": "In hate speech detection, developing training and evaluation datasets across various domains is the critical issue. Whereas, major approaches crawl social media texts and hire crowd-workers to annotate the data. Following this convention often restricts the scope of pejorative expressions to a single domain lacking generalization. Sometimes domain overlap between training corpus and evaluation set overestimate the prediction performance when pretraining language models on low-data language. To alleviate these problems in Korean, we propose APEACH that asks unspecified users to generate hate speech examples followed by minimal post-labeling. We find that APEACH can collect useful datasets that are less sensitive to the lexical overlaps between the pretraining corpus and the evaluation set, thereby properly measuring the model performance.", "track": "Resources and Evaluation", "label": 1}, {"loc": [4.6500139236450195, 9.146406173706055], "id": 4533, "title": "Guiding Neural Story Generation with Reader Models", "authors": "Xiangyu Peng, Kaige Xie, Amal Alabdulkarim, Harshith Kayam, Samihan Dani and Mark Riedl", "abstract": "Automated storytelling has long captured the attention of researchers for the ubiquity of narratives in everyday life. However, it is challenging to maintain coherence and stay on-topic\ntoward a specific ending when generating narratives with neural language models. In this paper, we introduce Story generation with Reader\nModels (StoRM), a framework in which a\nreader model is used to reason about the story\nshould progress. A reader model infers what\na human reader believes about the concepts,\nentities, and relations about the fictional story\nworld. We show how an explicit reader model\nrepresented as a knowledge graph affords the story\ncoherence and provides controllability in the\nform of achieving a given story world state\ngoal. Experiments show that our model produces significantly more coherent and on-topic\nstories, outperforming baselines in dimensions\nincluding plot plausibility and staying on topic", "track": "Natural Language Generation", "label": 6}, {"loc": [4.0919718742370605, 7.065012454986572], "id": 4539, "title": "Reason first, then respond: Modular Generation for Knowledge-infused Dialogue", "authors": "Leonard Adolphs, Kurt Shuster, Jack Urbanek, Arthur Szlam and Jason Weston", "abstract": "Large language models can produce fluent dialogue but often hallucinate factual inaccuracies. While retrieval-augmented models help alleviate this issue, they still face a difficult challenge of both reasoning to provide correct knowledge and generating conversation simultaneously. In this work, we propose a modular model, Knowledge to Response (K2R), for incorporating knowledge into conversational agents, which breaks down this problem into two easier steps. K2R first generates a knowledge sequence, given a dialogue context, as an intermediate step. After this \"reasoning step\", the model then attends to its own generated knowledge sequence, as well as the dialogue context, to produce a final response. In detailed experiments, we find that such a model hallucinates less in knowledge-grounded dialogue tasks, and has advantages in terms of interpretability and modularity. In particular, it can be used to fuse QA and dialogue systems together to enable dialogue agents to give knowledgeable answers, or QA models to give conversational responses in a zero-shot setting.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [9.931356430053711, 6.445319175720215], "id": 4540, "title": "Adapting Multilingual Models for Code-Mixed Translation", "authors": "Aditya Vavre, Abhirut Gupta and Sunita Sarawagi", "abstract": "The scarcity of gold standard code-mixed to pure language parallel data makes it difficult to train translation models reliably.\nPrior work has addressed the paucity of parallel data with data augmentation techniques.\nSuch methods rely heavily on external resources making systems difficult to train and scale effectively for multiple languages.\nWe present a simple yet highly effective two-stage back-translation based training scheme for adapting multilingual models to the task of code-mixed translation which eliminates dependence on external resources.\nWe show a substantial improvement in translation quality (measured through BLEU), beating existing prior work by up to +3.8 BLEU on code-mixed Hi$\\rightarrow$En, Mr$\\rightarrow$En, and Bn$\\rightarrow$En tasks. On the LinCE Machine Translation leader board, we achieve the highest score for code-mixed Es$\\rightarrow$En, beating existing best baseline by +6.5 BLEU, and our own stronger baseline by +1.1 BLEU.", "track": "Machine Translation", "label": 10}, {"loc": [8.681415557861328, 8.381829261779785], "id": 4543, "title": "LPC: A Logits and Parameter Calibration Framework for Continual Learning", "authors": "Xiaodi Li, Zhuoyi Wang, Dingcheng Li, Latifur Khan and Bhavani Thuraisingham", "abstract": "When we execute the typical fine-tuning paradigm on continuously sequential tasks, the model will suffer from the catastrophic forgetting problem (i.e., the model tends to adjust old parameters according to the new knowledge, which leads to the loss of previously acquired concepts). People proposed replay-based methods by accessing old data from extra storage and maintaining the parameters of old concepts, which actually raise the privacy issue and larger memory requirements. In this work, we aim to achieve the sequential/continual learning of knowledge without accessing the old data. The core idea is to calibrate the parameters and logits (output) so that preserving old parameters and generalized learning on new concepts can be solved simultaneously. Our proposed framework includes two major components, Logits Calibration (LC) and Parameter Calibration (PC). The LC focuses on calibrating the learning of novel models with old models, and PC aims to preserve the parameters of old models. These two operations can maintain the old knowledge while learning new tasks without storing previous data. We conduct experiments on various scenarios of the GLUE (the General Language Understanding Evaluation) benchmark. The experimental results show that our model achieves state-of-the-art performance in all scenarios.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [8.956563949584961, 6.496481418609619], "id": 4547, "title": "SlovakBERT: Slovak Masked Language Model", "authors": "Mat\u00fa\u0161 Pikuliak, \u0160tefan Grivalsk\u00fd, Martin Kon\u00f4pka, Miroslav Bl\u0161t\u00e1k, Martin Tamajka, Viktor Bachrat\u00fd, Marian Simko, Pavol Bal\u00e1\u017eik, Michal Trnka and Filip Uhl\u00e1rik", "abstract": "We introduce a new Slovak masked language model called \\textit{SlovakBERT}. This is to our best knowledge the first paper discussing Slovak transformers-based language models. We evaluate our model on several NLP tasks and achieve state-of-the-art results. This evaluation is likewise the first attempt to establish a benchmark for Slovak language models. We publish the masked language model, as well as the fine-tuned models for part-of-speech tagging, sentiment analysis and semantic textual similarity.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [2.076018810272217, 7.563241481781006], "id": 4552, "title": "Efficient Zero-shot Event Extraction with Context-Definition Alignment", "authors": "Hongming Zhang, Wenlin Yao and Dong Yu", "abstract": "Event extraction (EE) is the task of identifying interested event mentions from text.\nConventional efforts mainly focus on the supervised setting. However, these supervised models cannot generalize to event types out of the pre-defined ontology. To fill this gap, many efforts have been devoted to the zero-shot EE problem. This paper follows the trend of modeling event-type semantics but moves one step further. We argue that using the static embedding of the event type name might not be enough because a single word could be ambiguous, and we need a sentence to define the type semantics accurately. To model the definition semantics, we use two separate transformer models to project the contextualized event mentions and corresponding definitions into the same embedding space and then minimize their embedding distance via contrastive learning. On top of that, we also propose a warming phase to help the model learn the minor difference between similar definitions. We name our approach Zero-shot Event extraction with Definition (ZED). Experiments on the MAVEN dataset show that our model significantly outperforms all previous zero-shot EE methods with fast inference speed due to the disjoint design. Further experiments also show that \\ModelName~can be easily applied to the few-shot setting when the annotation is available and consistently outperforms baseline supervised methods.", "track": "Information Extraction", "label": 5}, {"loc": [4.441400051116943, 5.304572582244873], "id": 4555, "title": "Logical Fallacy Detection", "authors": "Zhijing Jin, Abhinav Lalwani, Tejas Vaidhya, Xiaoyu Shen, Yiwen Ding, Zhiheng LYU, Mrinmaya Sachan, Rada Mihalcea and Bernhard Schoelkopf", "abstract": "Reasoning is central to human intelligence. However, fallacious arguments are common, and some exacerbate problems such as spreading misinformation about climate change. In this paper, we propose the task of logical fallacy detection, and provide a new dataset (Logic) of logical fallacies generally found in text, together with an additional challenge set for detecting logical fallacies in climate change claims (LogicClimate). Detecting logical fallacies is a hard problem as the model must understand the underlying logical structure of the argument. We find that existing pretrained large language models perform poorly on this task. In contrast, we show that a simple structure-aware classifier outperforms the best language model by 5.46% F1 scores on Logic and 4.51% on LogicClimate. We encourage future work to explore this task since (a) it can serve as a new reasoning challenge for language models, and (b) it can have potential applications in tackling the spread of misinformation. Our dataset and code are available at https://\ngithub.com/causalNLP/logical-fallacy", "track": "NLP Applications", "label": 0}, {"loc": [4.09990930557251, 7.124439716339111], "id": 4561, "title": "Topic-Aware Response Generation in Task-Oriented Dialogue with Unstructured Knowledge Access", "authors": "Yue Feng, Gerasimos Lampouras and Ignacio Iacobacci", "abstract": "To alleviate the problem of structured databases' limited coverage, recent task-oriented dialogue systems incorporate external unstructured knowledge to guide the generation of system responses. However, these usually use word or sentence level similarities to detect the relevant knowledge context, which only partially capture the topical level relevance. In this paper, we examine how to better integrate topical information in knowledge grounded task-oriented dialogue and propose \"Topic-Aware Response Generation\" (TARG), an end-to-end response generation model. TARG incorporates multiple topic-aware attention mechanisms to derive the importance weighting scheme over dialogue utterances and external knowledge sources towards a better understanding of the dialogue history. Experimental results indicate that TARG achieves state-of-the-art performance in knowledge selection and response generation, outperforming previous state-of-the-art by 3.2, 3.6, and 4.2 points in EM, F1 and BLEU-4 respectively on Doc2Dial, and performing comparably with previous work on DSTC9; both being knowledge-grounded task-oriented dialogue datasets.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [7.272808074951172, 8.017844200134277], "id": 4563, "title": "Revisiting Transformer-based Models for Long Document Classification", "authors": "Xiang Dai, Ilias Chalkidis, Sune Darkner and Desmond Elliott", "abstract": "The recent literature in text classification is biased towards short text sequences (e.g., sentences or paragraphs). \nIn real-world applications, multi-page multi-paragraph documents are common and they cannot be efficiently encoded by vanilla Transformer-based models. \nWe compare different Transformer-based Long Document Classification (TrLDC) approaches that aim to mitigate the computational overhead of vanilla transformers to encode much longer text, namely sparse attention and hierarchical encoding methods.\nWe examine several aspects of sparse attention (e.g., size of local attention window, use of global attention) and hierarchical (e.g., document splitting strategy) transformers on four document classification datasets covering different domains. \nWe observe a clear benefit from being able to process longer text, and, based on our results, we derive practical advice of applying Transformer-based models on long document classification tasks.", "track": "NLP Applications", "label": 0}, {"loc": [5.274668216705322, 9.11607551574707], "id": 4568, "title": "Time-aware Prompting for Text Generation", "authors": "Shuyang Cao and Lu Wang", "abstract": "In this paper, we study the effects of incorporating timestamps, such as document creation dates, into generation systems. Two types of time-aware prompts are investigated: (1) textual prompts that encode document timestamps in natural language sentences; and (2) linear prompts that convert timestamps into continuous vectors. To explore extrapolation to future data points, we further introduce a new data-to-text generation dataset, TempWikiBio, containing more than 4 millions of chronologically ordered revisions of biographical articles from English Wikipedia, each paired with structured personal profiles.\nThrough data-to-text generation on TempWikiBio, text-to-text generation on the content transfer dataset, and summarization on XSum,\nwe show that linear prompts on encoder and textual prompts improve the generation quality on all datasets.\nDespite having less performance drop when testing on data drawn from a later time, linear prompts focus more on non-temporal information and are less sensitive to the given timestamps, according to human evaluations and sensitivity analyses.\nMeanwhile, textual prompts establish the association between the given timestamps and the output dates, yielding more factual temporal information in the output.", "track": "Natural Language Generation", "label": 6}, {"loc": [10.11783218383789, 7.202850818634033], "id": 4572, "title": "Improving Scheduled Sampling with Elastic Weight Consolidation for Neural Machine Translation", "authors": "Michalis Korakakis and Andreas Vlachos", "abstract": "Despite strong performance in many sequence-to-sequence tasks, autoregressive models trained with maximum likelihood estimation suffer from exposure bias, i.e. the discrepancy between the ground-truth prefixes used during training and the model-generated prefixes used at inference time. Scheduled sampling is a simple and empirically successful approach which addresses this issue by incorporating model-generated prefixes into training. However, it has been argued that it is an inconsistent training objective leading to models ignoring the prefixes altogether. In this paper, we conduct systematic experiments and find that scheduled sampling, while it ameliorates exposure bias by increasing model reliance on the input sequence, worsens performance when the prefix at inference time is correct, a form of catastrophic forgetting. We propose to use Elastic Weight Consolidation to better balance mitigating exposure bias with retaining performance. Experiments on four IWSLT'14 and WMT'14 translation datasets demonstrate that our approach alleviates catastrophic forgetting and significantly outperforms maximum likelihood estimation and scheduled sampling baselines.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [2.9034206867218018, 4.580212593078613], "id": 4585, "title": "Ensemble Transformer for Efficient and Accurate Ranking Tasks: an Application to Question Answering Systems", "authors": "Yoshitomo Matsubara, Luca Soldaini, Eric Lind and Alessandro Moschitti", "abstract": "Large transformer models can highly improve Answer Sentence Selection (AS2) tasks, but their high computational costs prevent their use in many real-world applications. In this paper, we explore the following research question: How can we make the AS2 models more accurate without significantly increasing their model complexity? To address the question, we propose a Multiple Heads Student architecture (named CERBERUS), an efficient neural network designed to distill an ensemble of large transformers into a single smaller model. CERBERUS consists of two components: a stack of transformer layers that is used to encode inputs, and a set of ranking heads; unlike traditional distillation technique, each of them is trained by distilling a different large transformer architecture in a way that preserves the diversity of the ensemble members. The resulting model captures the knowledge of heterogeneous transformer models by using just a few extra parameters. We show the effectiveness of CERBERUS on three English datasets for AS2; our proposed approach outperforms all single-model distillations we consider, rivaling the state-of-the-art large AS2 models that have 2.7\u00d7 more parameters and run 2.5\u00d7 slower. Code for our model is available at https://github.com/amazon-research/wqa-cerberus.", "track": "Question Answering", "label": 11}, {"loc": [8.425239562988281, 7.632030010223389], "id": 4587, "title": "Uncertainty Quantification with Pre-trained Language Models: A Large-Scale Empirical Analysis", "authors": "Yuxin Xiao, Paul Pu Liang, Umang Bhatt, Willie Neiswanger, Ruslan Salakhutdinov and Louis-Philippe Morency", "abstract": "Pre-trained language models (PLMs) have gained increasing popularity due to their compelling prediction performance in diverse natural language processing (NLP) tasks. When formulating a PLM-based prediction pipeline for NLP tasks, it is also crucial for the pipeline to minimize the calibration error, especially in safety-critical applications. That is, the pipeline should reliably indicate when we can trust its predictions. In particular, there are various considerations behind the pipeline: (1) the choice and (2) the size of PLM, (3) the choice of uncertainty quantifier, (4) the choice of fine-tuning loss, and many more. Although prior work has looked into some of these considerations, they usually draw conclusions based on a limited scope of empirical studies. There still lacks a holistic analysis on how to compose a well-calibrated PLM-based prediction pipeline. To fill this void, we compare a wide range of popular options for each consideration based on three prevalent NLP classification tasks and the setting of domain shift. In response, we recommend the following: (1) use ELECTRA for PLM encoding, (2) use larger PLMs if possible, (3) use Temp Scaling as the uncertainty quantifier, and (4) use Focal Loss for fine-tuning.", "track": "Resources and Evaluation", "label": 1}, {"loc": [4.020224094390869, 7.227565765380859], "id": 4590, "title": "How to Represent Context Better? An Empirical Study on Context Modeling for Multi-turn Response Selection", "authors": "Jiazhan Feng, Chongyang Tao, Chang Liu, Rui Yan and Dongyan Zhao", "abstract": "Building retrieval-based dialogue models that can predict appropriate responses based on the understanding of multi-turn context messages is a challenging problem. Early models usually concatenate all utterances or independently encode each dialogue turn, which may lead to an inadequate understanding of dialogue status. Although a few researchers have noticed the importance of context modeling in multi-turn response prediction, there is no systematic comparison to analyze how to model context effectively and no framework to unify those methods. In this paper, instead of configuring new architectures, we investigate how to improve existing models with a better context modeling method. Specifically, we heuristically summarize three categories of turn-aware context modeling strategies which model the context messages from the perspective of sequential relationship, local relationship, and query-aware manner respectively. A Turn-Aware Context Modeling (TACM) layer is explored to flexibly adapt and unify these context modeling strategies to several advanced response selection models. Evaluation results on three public data sets indicate that employing each individual context modeling strategy or multiple strategies can consistently improve the performance of existing models.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [10.56942367553711, 6.848044395446777], "id": 4591, "title": "CHIA: CHoosing Instances to Annotate for Machine Translation", "authors": "Rajat Bhatnagar, Ananya Ganesh and Katharina Kann", "abstract": "Neural machine translation (MT) systems have been shown to perform poorly on low-resource language pairs, for which large-scale parallel data is unavailable. Making the data annotation process faster and cheaper is therefore important to ensure equitable access to MT systems. To make optimal use of a limited annotation budget, we present CHIA (choosing instances to annotate), a method for selecting instances to annotate for machine translation. Using an existing multi-way parallel dataset of high-resource languages, we first identify instances, based on model training dynamics, that are most informative for training MT models for high-resource languages. \nWe find that there are cross-lingual commonalities in instances that are useful for MT model training, which we use to identify instances that will be useful to train models on a new target language. Evaluating on 20 languages from two corpora, we show that training on instances selected using our method provides an average performance improvement of 1.59 BLEU over training on randomly selected instances of the same size.", "track": "Machine Translation", "label": 10}, {"loc": [10.577953338623047, 7.187368392944336], "id": 4596, "title": "Guiding Neural Machine Translation with Semantic Kernels", "authors": "Ping Guo, Yue Hu, Xiangpeng Wei, Yubing Ren, Yunpeng Li, Luxi Xing and Yuqiang Xie", "abstract": "Machine Translation task has made great progress with the help of auto-regressive decoding paradigm and Transformer architecture. In this paradigm, though the encoder can obtain global source representations, the decoder can only use translation history to determine the current word. Previous promising works attempted to address this issue by applying a draft or a fixed-length semantic embedding as target-side global information. However, these methods either degrade model efficiency or show limitations in expressing semantics. Motivated by Functional Equivalence Theory, we extract several semantic kernels from a source sentence, each of which can express one semantic segment of the original sentence. Together, these semantic kernels can capture global semantic information, and we project them into target embedding space to guide target sentence generation. We further force our model to use semantic kernels at each decoding step through an adaptive mask algorithm. Empirical studies on various machine translation benchmarks show that our approach gains approximately an improvement of 1 BLEU score on most benchmarks over the Transformer baseline and about 1.7 times faster than previous works on average at inference time.", "track": "Machine Translation", "label": 10}, {"loc": [0.7051019072532654, 6.476152420043945], "id": 4597, "title": "HiSMatch: Historical Structure Matching based Temporal Knowledge Graph Reasoning", "authors": "Zixuan Li, Zhongni Hou, Saiping Guan, Xiaolong Jin, Weihua Peng, Long Bai, Yajuan Lyu, Wei Li, Jiafeng Guo and Xueqi Cheng", "abstract": "A Temporal Knowledge Graph (TKG) is a sequence of KGs with respective timestamps, which adopts quadruples in the form of (\\emph{subject}, \\emph{relation}, \\emph{object}, \\emph{timestamp}) to describe dynamic facts. TKG reasoning has facilitated many real-world applications via answering such queries as (\\emph{query entity}, \\emph{query relation}, \\emph{?}, \\emph{future timestamp}) about future. This is actually a matching task between a query and candidate entities based on their historical structures, which reflect behavioral trends of the entities at different timestamps. In addition, recent KGs provide background knowledge of all the entities, which is also helpful for the matching. Thus, in this paper, we propose the \\textbf{Hi}storical \\textbf{S}tructure \\textbf{Match}ing (\\textbf{HiSMatch}) model. It applies two structure encoders to capture the semantic information contained in the historical structures of the query and candidate entities. Besides, it adopts another encoder to integrate the background knowledge into the model. TKG reasoning experiments on six benchmark datasets demonstrate the significant improvement of the proposed HiSMatch model, with up to 5.6\\% performance improvement in MRR, compared to the state-of-the-art baselines.", "track": "Information Extraction", "label": 5}, {"loc": [8.211596488952637, 6.54801082611084], "id": 4606, "title": "Dependency Parsing via Sequence Generation", "authors": "Boda Lin, Zijun Yao, Jiaxin Shi, Shulin Cao, Binghao Tang, Si Li, Yong Luo, Juanzi Li and Lei Hou", "abstract": "Dependency parsing aims to extract syntactic dependency structure or semantic dependency structure for sentences.\nExisting methods for dependency parsing include transition-based method, graph-based method and sequence-to-sequence method.\nThese methods obtain excellent performance and we notice them belong to labeling method.\nTherefore, it may be very valuable and interesting to explore the possibility of using generative method to implement dependency parsing.\nIn this paper, we propose to achieve Dependency Parsing (DP) via Sequence Generation (SG) by utilizing only the pre-trained language model without any auxiliary structures.\nWe first explore different serialization designing strategies for converting parsing structures into sequences.\nThen we design dependency units and concatenate these units into the sequence for DPSG.\nWe verify the DPSG is capable of parsing on widely used DP benchmarks, i.e., PTB, UD2.2, SDP15 and SemEval16.\nIn addition, we also investigate the astonishing low-resource applicability of DPSG, which includes unsupervised cross-domain conducted on CODT and few-shot cross-task conducted on SDP15.\nOur research demonstrates that sequence generation is one of the effective methods to achieve dependency parsing.\nOur codes are available now.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [8.511802673339844, 8.262737274169922], "id": 4609, "title": "Scaling Laws Under the Microscope: Predicting Transformer Performance from Small Scale Experiments", "authors": "Maor Ivgi, Yair Carmon and Jonathan Berant", "abstract": "Neural scaling laws define a predictable relationship between a model's parameter count and its performance after training in the form of a power law. However, most research to date has not explicitly investigated whether scaling laws can be used to accelerate model development. \nIn this work, we perform such an empirical investigation across a wide range of language understanding tasks, starting from models with as few as 10K parameters, and evaluate downstream performance across 9 language understanding tasks.\nWe find that scaling laws emerge at finetuning time in some NLP tasks, and that they can also be exploited for debugging convergence when training large models. Moreover, for tasks where scaling laws exist, they can be used to predict the performance of larger models, which enables effective model selection. However, revealing scaling laws\nrequires careful hyperparameter tuning and multiple runs for the purpose of uncertainty estimation, which incurs additional overhead, partially offsetting the computational benefits.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [5.97166109085083, 5.87507963180542], "id": 4619, "title": "Analyzing the Limits of Self-Supervision in Handling Bias in Language", "authors": "Lisa Bauer, Karthik Gopalakrishnan, Spandana Gella, Yang Liu, Mohit Bansal and Dilek Hakkani-Tur", "abstract": "Prompting inputs with natural language task descriptions has emerged as a popular mechanism to elicit reasonably accurate outputs from large-scale generative language models with little to no in-context supervision. This also helps gain insight into how well language models capture the semantics of a wide range of downstream tasks purely from self-supervised pre-training on massive corpora of unlabeled text. Such models have naturally also been exposed to a lot of undesirable content like racist and sexist language and there is only some work on awareness of models along these dimensions. In this paper, we define and comprehensively evaluate how well such language models capture the semantics of four tasks for bias: diagnosis, identification, extraction and rephrasing. We define three broad classes of task descriptions for these tasks: statement, question, and completion, with numerous lexical variants within each class. We study the efficacy of prompting for each task using these classes and the null task description across several decoding methods and few-shot examples. Our analyses indicate that language models are capable of performing these tasks to widely varying degrees across different bias dimensions, such as gender and political affiliation. We believe our work is an important step towards unbiased language models by quantifying the limits of current self-supervision objectives at accomplishing such sociologically challenging tasks.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [7.5991668701171875, 12.304539680480957], "id": 4623, "title": "Multiple Instance Learning for Offensive Language Detection", "authors": "Jiexi Liu, Dehan Kong, Longtao Huang, Dinghui Mao and hui xue", "abstract": "Automatic offensive language detection has become a crucial issue in recent years. Existing researches on this topic are usually based on a large amount of data annotated at sentence level to train a robust model. However, sentence-level annotations are expensive in practice as the scenario expands, while there exist a large amount of natural labels from historical information on online platforms such as reports and punishments. Notably, these natural labels are usually in bag-level corresponding to the whole documents (articles, user profiles, conversations, etc.). Therefore, we target at proposing an approach capable of utilizing the bag-level labeled data for offensive language detection in this study. For this purpose, we formalize this task into a multiple instance learning (MIL) problem. We break down the design of existing MIL methods and propose a hybrid fusion MIL model with mutual-attention mechanism. In order to verify the validity of the proposed method, we present two new bag-level labeled datasets for offensive language detection: OLID-bags and MINOR. Experimental results based on the proposed datasets demonstrate the effectiveness of the mutual-attention method at both sentence level and bag level.", "track": "Ethics", "label": 21}, {"loc": [5.506701946258545, 8.555914878845215], "id": 4626, "title": "Grounded Keys-to-Text Generation: Towards Factual Open-Ended Generation", "authors": "Faeze Brahman, Baolin Peng, Michel Galley, Sudha Rao, Bill Dolan, Snigdha Chaturvedi and Jianfeng Gao", "abstract": "Large pre-trained language models have recently enabled open-ended generation frameworks (e.g., prompt-to-text NLG) to tackle a variety of tasks going beyond the traditional data-to-text generation. While this framework is more general, it is under-specified and often leads to a lack of controllability restricting their real-world usage. We propose a new grounded keys-to-text generation task: the task is to generate a factual description about an entity given a set of guiding keys, and grounding passages. To address this task, we introduce a new dataset, called EntDeGen. Inspired by recent QA-based evaluation measures, we propose an automatic metric, MAFE, for factual correctness of generated descriptions. Our EntDescriptor model is equipped with strong rankers to fetch helpful passages and generate entity descriptions. Experimental result shows a good correlation (60.14) between our proposed metric and human judgments of factuality. Our rankers significantly improved the factual correctness of generated descriptions (15.95% and 34.51% relative gains in recall and precision). Finally, our ablation study highlights the benefit of combining keys and groundings.", "track": "Natural Language Generation", "label": 6}, {"loc": [0.6820216774940491, 6.776662349700928], "id": 123, "title": "Generative Knowledge Graph Construction: A Review", "authors": "Hongbin Ye, Ningyu Zhang, Hui Chen and Huajun Chen", "abstract": "Generative Knowledge Graph Construction (KGC) refers to those methods that leverage the sequence-to-sequence framework for building knowledge graphs, which is flexible and can be adapted to widespread tasks. In this study, we summarize the recent compelling progress in generative knowledge graph construction. We present the advantages and weaknesses of each paradigm in terms of different generation targets and provide theoretical insight and empirical analysis. Based on the review, we suggest promising research directions for the future. Our contributions are threefold: (1) We present a detailed, complete taxonomy for the generative KGC methods; (2) We provide a theoretical and empirical analysis of the generative KGC methods; (3) We propose several research directions that can be developed in the future.", "track": "Information Extraction", "label": 5}, {"loc": [4.255823135375977, 7.2021002769470215], "id": 142, "title": "CDConv: A Benchmark for Contradiction Detection in Chinese Conversations", "authors": "Chujie Zheng, Jinfeng Zhou, Yinhe Zheng, Libiao Peng, Zhen Guo, Wenquan Wu, Zheng-Yu Niu, Hua Wu and Minlie Huang", "abstract": "Dialogue contradiction is a critical issue in open-domain dialogue systems. The contextualization nature of conversations makes dialogue contradiction detection rather challenging. In this work, we propose a benchmark for Contradiction Detection in Chinese Conversations, namely CDConv. It contains 12K multi-turn conversations annotated with three typical contradiction categories: Intra-sentence Contradiction, Role Confusion, and History Contradiction. To efficiently construct the CDConv conversations, we devise a series of methods for automatic conversation generation, which simulate common user behaviors that trigger chatbots to make contradictions. We conduct careful manual quality screening of the constructed conversations and show that state-of-the-art Chinese chatbots can be easily goaded into making contradictions. Experiments on CDConv show that properly modeling contextual information is critical for dialogue contradiction detection, but there are still unresolved challenges that require future research.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [8.583722114562988, 7.876671314239502], "id": 164, "title": "Transformer Feed-Forward Layers Build Predictions by Promoting Concepts in the Vocabulary Space", "authors": "Mor Geva, Avi Caciularu, Kevin Wang and Yoav Goldberg", "abstract": "Transformer-based language models (LMs) are at the core of modern NLP, but their internal prediction construction process is opaque and largely not understood. In this work, we make a substantial step towards unveiling this underlying prediction process, by reverse-engineering the operation of the feed-forward network (FFN) layers, one of the building blocks of transformer models. We view the token representation as a changing distribution over the vocabulary, and the output from each FFN layer as an additive update to that distribution. Then, we analyze the FFN updates in the vocabulary space, showing that each update can be decomposed to sub-updates corresponding to single FFN parameter vectors, each promoting concepts that are often human-interpretable. We then leverage these findings for controlling LM predictions, where we reduce the toxicity of GPT2 by almost 50%, and for improving computation efficiency with a simple early exit rule, saving 20% of computation on average.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [2.9749038219451904, 4.687404632568359], "id": 168, "title": "Learning to Generate Question by Asking Question: A Primal-Dual Approach with Uncommon Word Generation", "authors": "Qifan Wang, Li Yang, Xiaojun Quan, Fuli Feng, Dongfang Liu, Zenglin Xu, Sinong Wang and Hao Ma", "abstract": "Automatic question generation (AQG) is the task of generating a question from a given passage and an answer. Most existing AQG methods aim at encoding the passage and the answer to generate the question. However, limited work has focused on modeling the correlation between the target answer and the generated question. Moreover, unseen or rare word generation has not been studied in previous works. In this paper, we propose a novel approach which incorporates question generation with its dual problem, question answering, into a unified primal-dual framework. Specifically, the question generation component consists of an encoder that jointly encodes the answer with the passage, and a decoder that produces the question. The question answering component then re-asks the generated question on the passage to ensure that the target answer is obtained. We further introduce a knowledge distillation module to improve the model generalization ability. We conduct an extensive set of experiments on SQuAD and HotpotQA benchmarks. Experimental results demonstrate the superior performance of the proposed approach over several state-of-the-art methods.", "track": "NLP Applications", "label": 0}, {"loc": [0.7862079739570618, 7.540556907653809], "id": 236, "title": "Graph-based Model Generation for Few-Shot Relation Extraction", "authors": "Wanli Li and Tieyun Qian", "abstract": "Few-shot relation extraction (FSRE) has been a challenging problem since it only has a handful of training instances. Existing models follow a `one-for-all' scheme where one general large model performs all individual N-way-K-shot tasks in FSRE, which prevents the model from achieving the optimal point on each task. In view of this, we propose a model generation framework that consists of one general model for all tasks and many tiny task-specific models for each individual task. The general model generates and passes the universal knowledge to the tiny models which will be further fine-tuned when performing specific tasks. In this way, we decouple the complexity of the entire task space from that of all individual tasks while absorbing the universal knowledge.\nExtensive experimental results on two public datasets demonstrate that our framework reaches a new state-of-the-art performance for FRSE tasks. Our code is available at: https://github.com/NLPWM-WHU/GM_GEN.", "track": "Information Extraction", "label": 5}, {"loc": [8.076251029968262, 3.094322681427002], "id": 264, "title": "Backdoor Attacks in Federated Learning by Rare Embeddings and Gradient Ensembling", "authors": "Ki Yoon Yoo and Nojun Kwak", "abstract": "Recent advances in federated learning have demonstrated its promising capability to learn on decentralized datasets. However, a considerable amount of work has raised concerns due to the potential risks of adversaries participating in the framework to poison the global model for an adversarial purpose. This paper investigates the feasibility of model poisoning for backdoor attacks through rare word embeddings of NLP models. In text classification, less than 1% of adversary clients suffices to manipulate the model output without any drop in the performance of clean sentences. For a less complex dataset, a mere 0.1% of adversary clients is enough to poison the global model effectively. We also propose a technique specialized in the federated learning scheme called gradient ensemble, which enhances the backdoor performance in all experimental settings.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.497512340545654, 4.525813102722168], "id": 266, "title": "Generating Natural Language Proofs with Verifier-Guided Search", "authors": "Kaiyu Yang, Jia Deng and Danqi Chen", "abstract": "Reasoning over natural language is a challenging problem in NLP. In this work, we focus on proof generation: Given a hypothesis and a set of supporting facts, the model generates a proof tree indicating how to derive the hypothesis from supporting facts. Compared to generating the entire proof in one shot, stepwise generation can better exploit the compositionality and generalize to longer proofs but has achieved limited success on real-world data. Existing stepwise methods struggle to generate proof steps that are both logically valid and relevant to the hypothesis. Instead, they tend to hallucinate invalid steps given the hypothesis. In this paper, we present a novel stepwise method, NLProofS (Natural Language Proof Search), which learns to generate relevant steps conditioning on the hypothesis. At the core of our approach, we train an independent verifier to check the validity of the proof steps to prevent hallucination. Instead of generating steps greedily, we search for proofs maximizing a global proof score judged by the verifier. NLProofS achieves state-of-the-art performance on EntailmentBank and RuleTaker. Specifically, it improves the correctness of predicted proofs from 27.7% to 33.3% in the distractor setting of EntailmentBank, demonstrating the effectiveness of NLProofS in generating challenging human-authored proofs.", "track": "Question Answering", "label": 11}, {"loc": [3.86320424079895, 9.9197359085083], "id": 280, "title": "Toward Unifying Text Segmentation and Long Document Summarization", "authors": "Sangwoo Cho, Kaiqiang Song, Xiaoyang Wang, Fei Liu and Dong Yu", "abstract": "Text segmentation is important for signaling a document's structure. Without segmenting a long document into topically coherent sections, it is difficult for readers to comprehend the text, let alone find important information. The problem is only exacerbated by a lack of segmentation in transcripts of audio/video recordings. In this paper, we explore the role that section segmentation plays in extractive summarization of written and spoken documents. Our approach learns robust sentence representations by performing summarization and segmentation simultaneously, which is further enhanced by an optimization-based regularizer to promote selection of diverse summary sentences. We conduct experiments on multiple datasets ranging from scientific articles to spoken transcripts to evaluate the model's performance. Our findings suggest that the model can not only achieve state-of-the-art performance on publicly available benchmarks, but demonstrate better cross-genre transferability when equipped with text segmentation. We perform a series of analyses to quantify the impact of section segmentation on summarizing written and spoken documents of substantial length and complexity.", "track": "Summarization", "label": 14}, {"loc": [9.092198371887207, 6.21954870223999], "id": 307, "title": "The Geometry of Multilingual Language Model Representations", "authors": "Tyler Chang, Zhuowen Tu and Benjamin Bergen", "abstract": "We assess how multilingual language models maintain a shared multilingual representation space while still encoding language-sensitive information in each language. Using XLM-R as a case study, we show that languages occupy similar linear subspaces after mean-centering, evaluated based on causal effects on language modeling performance and direct comparisons between subspaces for 88 languages. The subspace means differ along language-sensitive axes that are relatively stable throughout middle layers, and these axes encode information such as token vocabularies. Shifting representations by language means is sufficient to induce token predictions in different languages. However, we also identify stable language-neutral axes that encode information such as token positions and part-of-speech. We visualize representations projected onto language-sensitive and language-neutral axes, identifying language family and part-of-speech clusters, along with spirals, toruses, and curves representing token position information. These results demonstrate that multilingual language models encode information along orthogonal language-sensitive and language-neutral axes, allowing the models to extract a variety of features for downstream tasks and cross-lingual transfer learning.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [2.8296313285827637, 4.764945030212402], "id": 321, "title": "Improving Complex Knowledge Base Question Answering via Question-to-Action and Question-to-Question Alignment", "authors": "Yechun Tang, Xiaoxia Cheng and Weiming Lu", "abstract": "Complex knowledge base question answering can be achieved by converting questions into sequences of predefined actions. However, there is a significant semantic and structural gap between natural language and action sequences, which makes this conversion difficult. In this paper, we introduce an alignment-enhanced complex question answering framework, called ALCQA, which mitigates this gap through question-to-action alignment and question-to-question alignment. We train a question rewriting model to align the question and each action, and utilize a pretrained language model to implicitly align the question and KG artifacts. Moreover, considering that similar questions correspond to similar action sequences, we retrieve top-k similar question-answer pairs at the inference stage through question-to-question alignment and propose a novel reward-guided action sequence selection strategy to select from candidate action sequences. We conduct experiments on CQA and WQSP datasets, and the results show that our approach outperforms state-of-the-art methods and obtains a 9.88\\% improvements in the F1 metric on CQA dataset. Our source code is available at \\url{\nhttps://github.com/TTTTTTTTy/ALCQA}.", "track": "Question Answering", "label": 11}, {"loc": [4.672688961029053, 6.941009998321533], "id": 375, "title": "PAIR: Prompt-Aware margIn Ranking for Counselor Reflection Scoring in Motivational Interviewing", "authors": "Do June Min, Ver\u00f3nica P\u00e9rez-Rosas, Kenneth Resnicow and Rada Mihalcea", "abstract": "Counselor reflection is a core verbal skill used by mental health counselors to express understanding and affirmation of the client's experience and concerns. In this paper, we propose a system for the analysis of counselor reflections. Specifically, our system takes as input one dialog turn containing a client prompt and a counselor response, and outputs a score indicating the level of reflection in the counselor response. We compile a dataset consisting of different levels of reflective listening skills, and propose the Prompt-Aware margIn Ranking (PAIR) framework that contrasts positive and negative prompt and response pairs using specially designed multi-gap and prompt-aware margin ranking losses. Through empirical evaluations and deployment of our system in a real-life educational environment, we show that our analysis model outperforms several baselines on different metrics, and can be used to provide useful feedback to counseling trainees.", "track": "Ethic Concerns:NLP Applications", "label": 0}, {"loc": [6.913618564605713, 9.56233024597168], "id": 376, "title": "Co-guiding Net: Achieving Mutual Guidances between Multiple Intent Detection and Slot Filling via Heterogeneous Semantics-Label Graphs", "authors": "Bowen Xing and Ivor Tsang", "abstract": "Recent graph-based models for joint multiple intent detection and slot filling have obtained promising results through modeling the guidance from the prediction of intents to the decoding of slot filling.\nHowever, existing methods (1) only model the \\textit{unidirectional guidance} from intent to slot; (2) adopt \\textit{homogeneous graphs} to model the interactions between the slot semantics nodes and intent label nodes, which limit the performance.\nIn this paper, we propose a novel model termed Co-guiding Net, which implements a two-stage framework achieving the \\textit{mutual guidances} between the two tasks.\nIn the first stage, the initial estimated labels of both tasks are produced, and then they are leveraged in the second stage to model the mutual guidances.\nSpecifically, we propose two \\textit{heterogeneous graph attention networks} working on the proposed two \\textit{heterogeneous semantics-label graphs}, which effectively represent the relations among the semantics nodes and label nodes.\nExperiment results show that our model outperforms existing models by a large margin, obtaining a relative improvement of 19.3\\% over the previous best model on MixATIS dataset in overall accuracy.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [10.305692672729492, 7.068643569946289], "id": 385, "title": "The Importance of Being Parameters: An Intra-Distillation Method for Serious Gains", "authors": "Haoran Xu, Philipp Koehn and Kenton Murray", "abstract": "Recent model pruning methods have demonstrated the ability to remove redundant parameters without sacrificing model performance. Common methods remove redundant parameters according to the parameter sensitivity, a gradient-based measure reflecting the contribution of the parameters. In this paper, however, we argue that redundant parameters can be trained to make beneficial contributions. We first highlight the large sensitivity (contribution) gap among high-sensitivity and low-sensitivity parameters and show that the model generalization performance can be significantly improved after balancing the contribution of all parameters. Our goal is to balance the sensitivity of all parameters and encourage all of them to contribute equally. We propose a general task-agnostic method, namely intra-distillation, appended to the regular training loss to balance parameter sensitivity. Moreover, we also design a novel adaptive learning method to control the strength of intra-distillation loss for faster convergence. Our experiments show the strong effectiveness of our methods on machine translation, natural language understanding, and zero-shot cross-lingual transfer across up to 48 languages, e.g., a gain of 3.54 BLEU on average across 8 language pairs from the IWSLT'14 dataset.", "track": "Machine Translation", "label": 10}, {"loc": [5.427731990814209, 5.067387104034424], "id": 449, "title": "Interpreting Language Models with Contrastive Explanations", "authors": "Kayo Yin and Graham Neubig", "abstract": "Model interpretability methods are often used to explain NLP model decisions on tasks such as text classification, where the output space is relatively small. However, when applied to language generation, where the output space often consists of tens of thousands of tokens, these methods are unable to provide informative explanations. \nLanguage models must consider various features to predict a token, such as its part of speech, number, tense, or semantics.\nExisting explanation methods conflate evidence for all these features into a single explanation, which is less interpretable for human understanding.\n\nTo disentangle the different decisions in language modeling, we focus on explaining language models contrastively: we look for salient input tokens that explain why the model predicted one token instead of another. We demonstrate that contrastive explanations are quantifiably better than non-contrastive explanations in verifying major grammatical phenomena, and that they significantly improve contrastive model simulatability for human observers. We also identify groups of contrastive decisions where the model uses similar evidence, and we are able to characterize what input tokens models use during various language generation decisions.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.9568190574646, 8.789165496826172], "id": 461, "title": "RankGen: Improving Text Generation with Large Ranking Models", "authors": "Kalpesh Krishna, Yapei Chang, John Wieting and Mohit Iyyer", "abstract": "Given an input sequence (or prefix), modern language models often assign high probabilities to output sequences that are repetitive, incoherent, or irrelevant to the prefix; as such, model-generated text also contains such artifacts. To address these issues we present RankGen, a 1.2B parameter encoder model for English that scores model generations given a prefix. RankGen can be flexibly incorporated as a scoring function in beam search and used to decode from any pretrained language model. We train RankGen using large-scale contrastive learning to map a prefix close to the ground-truth sequence that follows it and far away from two types of negatives: (1) random sequences from the same document as the prefix, and (2) sequences generated from a large language model conditioned on the prefix. Experiments across four different language models (345M-11B parameters) and two domains show that RankGen significantly outperforms decoding algorithms like nucleus, top-k, and typical sampling on both automatic metrics (85.0 vs 77.3 MAUVE) as well as human evaluations with English writers (74.5% human preference over nucleus sampling). Analysis reveals that RankGen outputs are more relevant to the prefix and improve continuity and coherence compared to baselines. We release our model checkpoints, code, and human preference data with explanations to facilitate future research.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.709000587463379, 11.713682174682617], "id": 528, "title": "Learning a Grammar Inducer from Massive Uncurated Instructional Videos", "authors": "Songyang Zhang, Linfeng Song, Lifeng Jin, Haitao Mi, Kun Xu, Dong Yu and Jiebo Luo", "abstract": "Video-aided grammar induction aims to leverage video information for finding more accurate syntactic grammars for accompanying text. While previous work focuses on building systems for inducing grammars on text that are well-aligned with video content, we investigate the scenario, in which text and video are only in loose correspondence. Such data can be found in abundance online, and the weak correspondence is similar to the indeterminacy problem studied in language acquisition. Furthermore, we build a new model that can better learn video-span correlation without manually designed features adopted by previous work. Experiments show that our model trained only on large-scale YouTube data with no text-video alignment reports strong and robust performances across three unseen datasets, despite domain shift and noisy label issues. Furthermore our model yields higher F1 scores than the previous state-of-the-art systems trained on in-domain data.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [5.963115215301514, 11.924002647399902], "id": 538, "title": "Normalized Contrastive Learning for Text-Video Retrieval", "authors": "Yookoon Park, Mahmoud Azab, Seungwhan Moon, Bo Xiong, Florian Metze, Gourab Kundu and Kirmani Ahmed", "abstract": "Cross-modal contrastive learning has led the recent advances in multimodal retrieval with its simplicity and effectiveness. In this work, however, we reveal that cross-modal contrastive learning suffers from incorrect normalization of the sum retrieval probabilities of each text or video instance. Specifically, we show that many test instances are either over- or under-represented during retrieval, significantly hurting the retrieval performance. To address this problem, we propose Normalized Contrastive Learning (NCL) which utilizes the Sinkhorn-Knopp algorithm to compute the instance-wise biases that properly normalize the sum retrieval probabilities of each instance so that every text and video instance is fairly represented during cross-modal retrieval. Empirical study shows that NCL brings consistent and significant gains in text-video retrieval on different model architectures, with new state-of-the-art multimodal retrieval metrics on the ActivityNet, MSVD, and MSR-VTT datasets without any architecture engineering.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.356980800628662, 7.513444423675537], "id": 567, "title": "Estimating Soft Labels for Out-of-Domain Intent Detection", "authors": "Hao Lang, Yinhe Zheng, Jian Sun, Fei Huang, Luo Si and Yongbin Li", "abstract": "Out-of-Domain (OOD) intent detection is important for practical dialog systems. To alleviate the issue of lacking OOD training samples, some works propose synthesizing pseudo OOD samples and directly assigning one-hot OOD labels to these pseudo samples. However, these one-hot labels introduce noises to the training process because some ``hard'' pseudo OOD samples may coincide with In-Domain (IND) intents. In this paper, we propose an adaptive soft pseudo labeling (ASoul) method that can estimate soft labels for pseudo OOD samples when training OOD detectors. Semantic connections between pseudo OOD samples and IND intents are captured using an embedding graph. A co-training framework is further introduced to produce resulting soft labels following the smoothness assumption, i.e., close samples are likely to have similar labels. Extensive experiments on three benchmark datasets show that ASoul consistently improves the OOD detection performance and outperforms various competitive baselines.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [5.092829704284668, 12.21367073059082], "id": 594, "title": "Multi-VQG: Generating Engaging Questions for Multiple Images", "authors": "Min-Hsuan Yeh, Vincent Foster Chen, Ting-Hao Kenneth Huang and Lun-Wei Ku", "abstract": "Generating engaging content has drawn much recent attention in the NLP community. Asking questions is a natural way to respond to photos and promote awareness. However, most answers to questions in traditional question-answering (QA) datasets are factoids, which reduce individuals' willingness to answer. Furthermore, traditional visual question generation (VQG) confines the source data for question generation to single images, resulting in a limited ability to comprehend time-series information of the underlying event. In this paper, we propose generating engaging questions from multiple images. We present MVQG, a new dataset, and establish a series of baselines, including both end-to-end and dual-stage architectures. Results show that building stories behind the image sequence enables models to\ngenerate engaging questions, which confirms our assumption that people typically construct a picture of the event in their minds before asking questions. These results open up an exciting challenge for visual-and-language models to implicitly construct a story behind a series of photos to allow for creativity and experience sharing and hence draw attention to downstream applications.", "track": "Resources and Evaluation", "label": 1}, {"loc": [3.361759901046753, 4.537327289581299], "id": 615, "title": "Tomayto, Tomahto. Beyond Token-level Answer Equivalence for Question Answering Evaluation", "authors": "Jannis Bulian, Christian Buck, Wojciech Gajewski, Benjamin B\u00f6rschinger and Tal Schuster", "abstract": "The predictions of question answering (QA) systems are typically evaluated against manually annotated finite sets of one or more answers. This leads to a coverage limitation that results in underestimating the true performance of systems, and is typically addressed by extending over exact match (EM) with predefined rules or with the token-level F1 measure.\nIn this paper, we present the first systematic conceptual and data-driven analysis to examine the shortcomings of token-level equivalence measures.\n\nTo this end, we define the asymmetric notion of answer equivalence (AE), accepting answers that are equivalent to or improve over the reference, and publish over 23k human judgements for candidates produced by multiple QA systems on SQuAD.\n\nThrough a careful analysis of this data, we reveal and quantify several concrete limitations of the F1 measure, such as a false impression of graduality, or missing dependence on the question.\n\nSince collecting AE annotations for each evaluated model is expensive, we learn a BERT matching (BEM) measure to approximate this task. Being a simpler task than QA, we find BEM to provide significantly better AE approximations than F1, and to more accurately reflect the performance of systems.\n\nFinally, we demonstrate the practical utility of AE and BEM on the concrete application of minimal accurate prediction sets, reducing the number of required answers by up to X2.6.", "track": "Resources and Evaluation", "label": 1}, {"loc": [10.201896667480469, 7.435024738311768], "id": 633, "title": "Non-Parametric Domain Adaptation for End-to-End Speech Translation", "authors": "Yichao Du, Weizhi Wang, Zhirui Zhang, Boxing Chen, Tong Xu, Jun Xie and Enhong Chen", "abstract": "The end-to-end speech translation (E2E-ST) has received increasing attention due to the potential of its less error propagation, lower latency and fewer parameters. However, the effectiveness of neural-based approaches to this task is severely limited by the available training corpus, especially for domain adaptation where in-domain triplet data is scarce or nonexistent. In this paper, we propose a novel non-parametric method that leverages in-domain text translation corpus to achieve domain adaptation for E2E-ST systems. To this end, we first incorporate an additional encoder into the pre-trained E2E-ST model to realize text translation modeling, based on which the decoder's output representations for text and speech translation tasks are unified by reducing the correspondent representation mismatch in available triplet training data. During domain adaptation, a k-nearest-neighbor (kNN) classifier is introduced to produce the final translation distribution using the external datastore built by the domain-specific text translation corpus, while the universal output representation is adopted to perform a similarity search. Experiments on the Europarl-ST benchmark demonstrate that when in-domain text translation data is involved only, our proposed approach significantly improves baseline by 12.82 BLEU on average in all translation directions, even outperforming the strong in-domain fine-tuning strategy.", "track": "Machine Translation", "label": 10}, {"loc": [7.496976852416992, 12.38073444366455], "id": 669, "title": "Prompting for Multimodal Hateful Meme Classification", "authors": "Rui Cao, Roy Ka-Wei Lee, Wen-Haw Chong and Jing Jiang", "abstract": "Hateful meme classification is a challenging multimodal task that requires complex reasoning and contextual background knowledge. Ideally, we could leverage an explicit external knowledge base to supplement contextual and cultural information in hateful memes. However, there is no known explicit external knowledge base that could provide such hate speech contextual information. To address this gap, we propose PromptHate, a simple yet effective prompt-based model that prompts pre-trained language models (PLMs) for hateful meme classification. Specifically, we construct simple prompts and provide a few in-context examples to exploit the implicit knowledge in the pre-trained RoBERTa language model for hateful meme classification. We conduct extensive experiments on two publicly available hateful and offensive meme datasets. Our experiment results show that PromptHate is able to achieve a high AUC of 90.96, outperforming state-of-the-art baselines on the hateful meme classification task. We also perform fine-grain analyses and case studies on various prompt settings and demonstrate the effectiveness of the prompts on hateful meme classification.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [1.8098297119140625, 3.895242929458618], "id": 675, "title": "Certified Error Control of Candidate Set Pruning for Two-Stage Relevance Ranking", "authors": "Minghan Li, Xinyu Zhang, Ji Xin, Hongyang Zhang and Jimmy Lin", "abstract": "In information retrieval (IR), candidate set pruning has been commonly used to speed up two-stage relevance ranking. However, such an approach lacks accurate error control and often trades accuracy against computational efficiency in an empirical fashion, missing theoretical guarantees. In this paper, we propose the concept of certified error control of candidate set pruning for relevance ranking, which means that the test error after pruning is guaranteed to be controlled under a user-specified threshold with high probability. Both in-domain and out-of-domain experiments show that our method successfully prunes the first-stage retrieved candidate sets to improve the second-stage reranking speed while satisfying the pre-specified accuracy constraints in both settings. For example, on MS MARCO Passage v1, our method reduces the average candidate set size from 1000 to 27, increasing reranking speed by about 37 times, while keeping MRR@10 greater than a pre-specified value of 0.38 with about 90% empirical coverage. In contrast, empirical baselines fail to meet such requirements. Code and data are available at: https://github.com/alexlimh/CEC-Ranking.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [9.630433082580566, 8.070455551147461], "id": 686, "title": "Linearizing Transformer with Key-Value Memory", "authors": "Yizhe Zhang and Deng Cai", "abstract": "Efficient transformer variants with linear time complexity have been developed to mitigate the quadratic computational overhead of the vanilla transformer. Among them are low-rank projection methods such as Linformer and kernel-based Transformers. Despite their unique merits, they usually suffer from a performance drop comparing with the vanilla transformer on many sequence generation tasks, and often fail to obtain computation gain when the generation is short. We propose Memsizer, an approach towards closing the performance gap while improving the efficiency even with short generation. It projects the source sequences into lower dimension representations like Linformer, while enjoying efficient recurrent-style incremental computation similar to kernel-based transformers. This yields linear computation time and constant memory complexity at inference time. Memsizer also employs a lightweight multi-head mechanism which renders the computation as light as a single-head model. We demonstrate that Memsizer provides an improved balance between efficiency and accuracy over the vanilla transformer and other efficient transformer variants in three typical sequence generation tasks, including machine translation, abstractive text summarization, and language modeling.", "track": "Natural Language Generation", "label": 6}, {"loc": [6.293059825897217, 12.311273574829102], "id": 689, "title": "Robustness of Fusion-based Multimodal Classifiers to Cross-Modal Content Dilutions", "authors": "Gaurav Verma, Vishwa Vinay, Ryan Rossi and Srijan Kumar", "abstract": "As multimodal learning finds applications in a wide variety of high-stakes societal tasks, investigating their robustness becomes important. Existing work has focused on understanding the robustness of vision-and-language models to imperceptible variations on benchmark tasks. In this work, we investigate the robustness of multimodal classifiers to cross-modal dilutions \u2013 a plausible variation. We develop a model that, given a multimodal (image + text) input, generates additional dilution text that (a) maintains relevance and topical coherence with the image and existing text, and (b) when added to the original text, leads to misclassification of the multimodal input. Via experiments on Crisis Humanitarianism and Sentiment Detection tasks, we find that the performance of task-specific fusion-based multimodal classifiers drops by 23.3% and 22.5%, respectively, in the presence of dilutions generated by our model. Metric-based comparisons with several baselines and human evaluations indicate that our dilutions show higher relevance and topical coherence, while simultaneously being more effective at demonstrating the brittleness of the multimodal classifiers. Our work aims to highlight and encourage further research on the robustness of deep multimodal models to realistic variations, especially in human-facing societal applications.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.207403659820557, 11.77302360534668], "id": 698, "title": "Translation between Molecules and Natural Language", "authors": "Carl Edwards, Tuan M. Lai, Kevin Ros, Garrett Honke, Kyunghyun Cho and Heng Ji", "abstract": "We present MolT5 - a self-supervised learning framework for pretraining models on a vast amount of unlabeled natural language text and molecule strings. MolT5 allows for new, useful, and challenging analogs of traditional vision-language tasks, such as molecule captioning and text-based de novo molecule generation (altogether: translation between molecules and language), which we explore for the first time. Since MolT5 pretrains models on single-modal data, it helps overcome the chemistry domain shortcoming of data scarcity. Furthermore, we consider several metrics, including a new cross-modal embedding-based metric, to evaluate the tasks of molecule captioning and text-based molecule generation. Our results show that MolT5-based models are able to generate outputs, both molecules and captions, which in many cases are high quality.", "track": "NLP Applications", "label": 0}, {"loc": [7.785195827484131, 8.157483100891113], "id": 701, "title": "What Makes Instruction Learning Hard? An Investigation and a New Challenge in a Synthetic Environment", "authors": "Matthew Finlayson, Kyle Richardson, Ashish Sabharwal and Peter Clark", "abstract": "The instruction learning paradigm---where a model learns to perform new tasks from task descriptions alone---has become popular in research on general-purpose models. The capabilities of large transformer models as instruction learners, however, remain poorly understood. We use a controlled synthetic environment to characterize such capabilities. Specifically, we use the task of deciding whether a given string matches a regular expression (viewed as an instruction) to identify properties of tasks, instructions, and instances that make instruction learning challenging. For instance, we find that our model, a fine-tuned T5-based text2text transformer, struggles with large regular languages, suggesting that less precise instructions are challenging for models. Instruction executions that require tracking longer contexts of prior steps are also difficult. We use our findings to systematically construct a challenging instruction learning dataset, which we call Hard RegSet. Fine-tuning on Hard RegSet, our large transformer learns to correctly interpret (with at least 90\\% accuracy) only 65.6\\% of test instructions, and 11\\%-24\\% of the instructions in out-of-distribution generalization settings. We thus propose Hard RegSet as a challenging instruction learning dataset, and a controlled environment for studying instruction learning.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [2.3501217365264893, 7.850691318511963], "id": 702, "title": "Sentence-Incremental Neural Coreference Resolution", "authors": "Matt Grenander, Shay B. Cohen and Mark Steedman", "abstract": "We propose a sentence-incremental neural coreference resolution system which incrementally builds clusters after marking mention boundaries in a shift-reduce method. The system is aimed at bridging two recent approaches at coreference resolution: (1) state-of-the-art non-incremental models that incur quadratic complexity in document length with high computational cost, and (2) memory network-based models which operate incrementally but do not generalize beyond pronouns. For comparison, we simulate an incremental setting by constraining non-incremental systems to form partial coreference chains before observing new sentences. In this setting, our system outperforms comparable state-of-the-art methods by 2 F1 on OntoNotes and 6.8 F1 on the CODI-CRAC 2021 corpus. In a conventional coreference setup, our system achieves 76.3 F1 on OntoNotes and 45.5 F1 on CODI-CRAC 2021, which is comparable to state-of-the-art baselines. We also analyze variations of our system and show that the degree of incrementality in the encoder has a surprisingly large effect on the resulting performance.", "track": "Discourse and Pragmatics", "label": 24}, {"loc": [3.921996593475342, 9.257647514343262], "id": 735, "title": "SNaC: Coherence Error Detection for Narrative Summarization", "authors": "Tanya Goyal, Junyi Jessy Li and Greg Durrett", "abstract": "Progress in summarizing long texts is inhibited by the lack of appropriate evaluation frameworks. A long summary that appropriately covers the facets of that text must also present a coherent narrative, but current automatic and human evaluation methods fail to identify gaps in coherence. In this work, we introduce SNaC, a narrative coherence evaluation framework for fine-grained annotations of long summaries. We develop a taxonomy of coherence errors in generated narrative summaries and collect span-level annotations for 6.6k sentences across 150 book and movie summaries. Our work provides the first characterization of coherence errors generated by state-of-the-art summarization models and a protocol for eliciting coherence judgments from crowdworkers. Furthermore, we show that the collected annotations allow us to benchmark past work in coherence modeling and train a strong classifier for automatically localizing coherence errors in generated summaries. Finally, our SNaC framework can support future work in long document summarization and coherence evaluation, including improved summarization modeling and post-hoc summary correction.", "track": "Summarization", "label": 14}, {"loc": [3.8970723152160645, 9.954273223876953], "id": 737, "title": "HydraSum: Disentangling Style Features in Text Summarization with Multi-Decoder Models", "authors": "Tanya Goyal, Nazneen Rajani, Wenhao Liu and Wojciech Kryscinski", "abstract": "Summarization systems make numerous ``decisions'' about summary properties during inference, e.g. degree of copying, specificity and length of outputs, etc. However, these are implicitly encoded within model parameters and specific styles cannot be enforced. To address this, we introduce HydraSum, a new summarization architecture that extends the single decoder framework of current models to a mixture-of-experts version with multiple decoders. We show that HydraSum's multiple decoders automatically learn contrasting summary styles when trained under the standard training objective without any extra supervision. Through experiments on three summarization datasets (CNN, Newsroom and XSum), we show that HydraSum provides a simple mechanism to obtain stylistically-diverse summaries by sampling from either individual decoders or their mixtures, outperforming baseline models. Finally, we demonstrate that a small modification to the gating strategy during training can enforce an even stricter style partitioning, e.g. high- vs low-abstractiveness or high- vs low-specificity, allowing users to sample from a larger area in the generation space and vary summary styles along multiple dimensions.", "track": "Summarization", "label": 14}, {"loc": [0.58744215965271, 7.263391017913818], "id": 776, "title": "A Good Neighbor, A Found Treasure: Mining Treasured Neighbors for Knowledge Graph Entity Typing", "authors": "Zhuoran Jin, Pengfei Cao, Yubo Chen, Kang Liu and Jun Zhao", "abstract": "The task of knowledge graph entity typing (KGET) aims to infer the missing types for entities in knowledge graphs. Some pioneering work has proved that neighbor information is very important for the task. However, existing methods only leverage the one-hop neighbor information of the central entity, ignoring the multi-hop neighbor information that can provide valuable clues for inference. Besides, we also observe that there are co-occurrence relations between types, which is very helpful to alleviate false-negative problem. In this paper, we propose a novel method called Mining Treasured Neighbors (MiNer) to make use of these two characteristics. Firstly, we devise a Neighbor Information Aggregation module to aggregate the neighbor information. Then, we propose an Entity Type Inference module to mitigate the adverse impact of the irrelevant neighbor information. Finally, a Type Co-occurrence Regularization module is designed to prevent the model from overfitting the false negative examples caused by missing types. Experimental results on two widely used datasets indicate that our approach significantly outperforms previous state-of-the-art methods.", "track": "Information Extraction", "label": 5}, {"loc": [0.4618845283985138, 7.0692243576049805], "id": 781, "title": "Guiding Neural Entity Alignment with Compatibility", "authors": "Bing Liu, Harrisen Scells, Wen Hua, Guido Zuccon, Genghong Zhao and Xia Zhang", "abstract": "Entity Alignment (EA) aims to find equivalent entities between two Knowledge Graphs (KGs). While numerous neural EA models have been devised, they are mainly learned using labelled data only. In this work, we argue that different entities within one KG should have compatible counterparts in the other KG due to the potential dependencies among the entities. Making compatible predictions thus should be one of the goals of training an EA model along with fitting the labelled data: this aspect however is neglected in current methods. To power neural EA models with compatibility, we devise a training framework by addressing three problems: (1) how to measure the compatibility of an EA model; (2) how to inject the property of being compatible into an EA model; (3) how to optimise parameters of the compatibility model. Extensive experiments on widely-used datasets demonstrate the advantages of integrating compatibility within EA models. In fact, state-of-the-art neural EA models trained within our framework using just 5% of the labelled data can achieve comparable effectiveness with supervised training using 20% of the labelled data.", "track": "NLP Applications", "label": 0}, {"loc": [4.35542106628418, 7.514373779296875], "id": 790, "title": "InstructDial: Improving Zero and Few-shot Generalization in Dialogue through Instruction Tuning", "authors": "Prakhar Gupta, Cathy Jiao, Yi-Ting Yeh, Shikib Mehri, Maxine Eskenazi and Jeffrey P. Bigham", "abstract": "Instruction tuning is an emergent paradigm in NLP wherein natural language instructions are leveraged with language models to induce zero-shot performance on unseen tasks. Dialogue is an especially interesting area in which to explore instruction tuning because dialogue systems perform multiple kinds of tasks related to language (e.g., natural language understanding and generation, domain-specific interaction), yet instruction tuning has not been systematically explored for dialogue-related tasks. We introduce InstructDial, an instruction tuning framework for dialogue, which consists of a repository of 48 diverse dialogue tasks in a unified text-to-text format created from 59 openly available dialogue datasets. We explore cross-task generalization ability on models tuned on InstructDial across diverse dialogue tasks. Our analysis reveals that InstructDial enables good zero-shot performance on unseen datasets and tasks such as dialogue evaluation and intent detection, and even better performance in a few-shot setting. To ensure that models adhere to instructions, we introduce novel meta-tasks. We establish benchmark zero-shot and few-shot performance of models trained using the proposed framework on multiple dialogue tasks.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [8.592419624328613, 6.591548442840576], "id": 792, "title": "Unsupervised Boundary-Aware Language Model Pretraining for Chinese Sequence Labeling", "authors": "Peijie Jiang, Dingkun Long, Yanzhao Zhang, Pengjun Xie, Meishan Zhang and Min Zhang", "abstract": "Boundary information is critical for various Chinese language processing tasks, such as word segmentation, part-of-speech tagging, and named entity recognition. Previous studies usually resorted to the use of a high-quality external lexicon, where lexicon items can offer explicit boundary information. However, to ensure the quality of the lexicon, great human effort is always necessary, which has been generally ignored. In this work, we suggest unsupervised statistical boundary information instead, and propose an architecture to encode the information directly into pre-trained language models, resulting in Boundary-Aware BERT (BABERT). We apply BABERT for feature induction of Chinese sequence labeling tasks. Experimental results on ten benchmarks of Chinese sequence labeling demonstrate that BABERT can provide consistent improvements on all datasets. In addition, our method can complement previous supervised lexicon exploration, where further improvements can be achieved when integrated with external lexicon information.", "track": "Phonology, Morphology and Word Segmentation", "label": 25}, {"loc": [1.8531019687652588, 3.931609869003296], "id": 862, "title": "RetroMAE: Pre-Training Retrieval-oriented Language Models Via Masked Auto-Encoder", "authors": "Shitao Xiao, Zheng Liu, Yingxia Shao and Zhao Cao", "abstract": "Despite pre-training's progress in many important NLP tasks, it remains to explore effective pre-training strategies for dense retrieval. In this paper, we propose RetroMAE, a new retrieval oriented pre-training paradigm based on Masked Auto-Encoder (MAE). RetroMAE is highlighted by three critical designs. 1) A novel MAE workflow, where the input sentence is polluted for encoder and decoder with different masks. The sentence embedding is generated from the encoder's masked input; then, the original sentence is recovered based on the sentence embedding and the decoder's masked input via masked language modeling. 2) Asymmetric model structure, with a full-scale BERT like transformer as encoder, and a one-layer transformer as decoder. 3) Asymmetric masking ratios, with a moderate ratio for encoder: 15~30%, and an aggressive ratio for decoder: 50~70%. Our framework is simple to realize and empirically competitive: the pre-trained models dramatically improve the SOTA performances on a wide range of dense retrieval benchmarks, like BEIR and MS MARCO. The source code and pre-trained models are made publicly available at https://github.com/staoxiao/RetroMAE so as to inspire more interesting research.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [3.091543436050415, 6.207213401794434], "id": 865, "title": "Aligning Recommendation and Conversation via Dual Imitation", "authors": "Jinfeng Zhou, Bo Wang, Minlie Huang, Dongming Zhao, Kun Huang, Ruifang He and Yuexian Hou", "abstract": "Human conversations of recommendation naturally involve the shift of interests which can align the recommendation actions and conversation process to make accurate recommendations with rich explanations. However, existing conversational recommendation systems (CRS) ignore the advantage of user interest shift in connecting recommendation and conversation, which leads to an ineffective loose coupling structure of CRS. To address this issue, by modeling the recommendation actions as recommendation paths in a knowledge graph (KG), we propose DICR (\\textbf{D}ual \\textbf{I}mitation for \\textbf{C}onversational \\textbf{R}ecommendation), which designs a dual imitation to explicitly align the recommendation paths and user interest shift paths in a recommendation module and a conversation module, respectively. By exchanging alignment signals, DICR achieves bidirectional promotion between recommendation and conversation modules and generates high-quality responses with accurate recommendations and coherent explanations. Experiments demonstrate that DICR outperforms the state-of-the-art models on recommendation and conversation performance with automatic, human, and novel explainability metrics.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [3.0356905460357666, 4.634920120239258], "id": 888, "title": "QRelScore: Better Evaluating Generated Questions with Deeper Understanding of Context-aware Relevance", "authors": "Xiaoqiang Wang, Bang Liu, Siliang Tang and Lingfei Wu", "abstract": "Existing metrics for assessing question generation not only require costly human reference but also fail to take into account the input context of generation, rendering the lack of deep understanding of the relevance between the generated questions and input contexts. As a result, they may wrongly penalize a legitimate and reasonable candidate question when it (1) involves complicated reasoning with the context or (2) can be grounded by multiple evidences in the context.\nIn this paper, we propose QRelScore, a context-aware Relevance evaluation metric for Question Generation.\nBased on off-the-shelf language models such as BERT and GPT2, QRelScore employs both word-level hierarchical matching and sentence-level prompt-based generation to cope with the complicated reasoning and diverse generation from multiple evidences, respectively.\nCompared with existing metrics, our experiments demonstrate that QRelScore is able to achieve a higher correlation with human judgments while being much more robust to adversarial samples.", "track": "Resources and Evaluation", "label": 1}, {"loc": [5.2229228019714355, 12.128870964050293], "id": 929, "title": "Abstract Visual Reasoning with Tangram Shapes", "authors": "Anya Ji, Noriyuki Kojima, Noah J. Rush, Alane Suhr, Wai Keen Vong, Robert Hawkins and Yoav Artzi", "abstract": "We introduce KiloGram, a resource for studying abstract visual reasoning in humans and machines. Drawing on the history of tangram puzzles as stimuli in cognitive science, we build a richly annotated dataset that, with >1k distinct stimuli, is orders of magnitude larger and more diverse than prior resources. It is both visually and linguistically richer, moving beyond whole shape descriptions to include segmentation maps and part labels. We use this resource to evaluate the abstract visual reasoning capacities of recent multi-modal models. We observe that pre-trained weights demonstrate limited abstract reasoning, which dramatically improves with fine-tuning. We also observe that explicitly describing parts aids abstract reasoning for both humans and models, especially when jointly encoding the linguistic and visual inputs.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [1.7461912631988525, 5.390502452850342], "id": 939, "title": "UnifiedSKG: Unifying and Multi-Tasking Structured Knowledge Grounding with Text-to-Text Language Models", "authors": "Tianbao Xie, Chen Henry Wu, Peng Shi, Ruiqi Zhong, Torsten Scholak, Michihiro Yasunaga, Chien-Sheng Wu, Ming Zhong, Pengcheng Yin, Sida I. Wang, Victor Zhong, Bailin Wang, Chengzu Li, Connor Boyle, Ansong Ni, Ziyu Yao, Dragomir Radev, Caiming Xiong, Lingpeng Kong, Rui Zhang, Noah A. Smith, Luke Zettlemoyer and Tao Yu", "abstract": "Structured knowledge grounding (SKG) leverages structured knowledge to complete user requests, such as semantic parsing over databases and question answering over knowledge bases. Since the inputs and outputs of SKG tasks are heterogeneous, they have been studied separately by different communities, which limits systematic and compatible research on SKG. In this paper, we overcome this limitation by proposing the UnifiedSKG framework, which unifies 21 SKG tasks into a text-to-text format, aiming to promote systematic SKG research, instead of being exclusive to a single task, domain, or dataset. We use UnifiedSKG to benchmark T5 with different sizes and show that T5, with simple modifications when necessary, achieves state-of-the-art performance on almost all of the 21 tasks. We further demonstrate that multi-task prefix-tuning improves the performance on most tasks, largely improving the overall performance. UnifiedSKG also facilitates the investigation of zero-shot and few-shot learning, and we show that T0, GPT-3, and Codex struggle in zero-shot and few-shot learning for SKG. We also use UnifiedSKG to conduct a series of controlled experiments on structured knowledge encoding variants across SKG tasks. UnifiedSKG is easily extensible to more tasks, and it is open-sourced at https://github.com/hkunlp/unifiedskg.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [7.533488750457764, 3.741511344909668], "id": 987, "title": "Balanced Adversarial Training: Balancing Tradeoffs between Fickleness and Obstinacy in NLP Models", "authors": "Hannah Chen, Yangfeng Ji and David Evans", "abstract": "Traditional (fickle) adversarial examples involve finding a small perturbation that does not change an input's true label but confuses the classifier into outputting a different prediction. Conversely, obstinate adversarial examples occur when an adversary finds a small perturbation that preserves the classifier's prediction but changes the true label of an input.\nAdversarial training and certified robust training have shown some effectiveness in improving the robustness of machine learnt models to fickle adversarial examples. We show that standard adversarial training methods focused on reducing vulnerability to fickle adversarial examples may make a model more vulnerable to obstinate adversarial examples, with experiments for both natural language inference and paraphrase identification tasks. To counter this phenomenon, we introduce Balanced Adversarial Training, which incorporates contrastive learning to increase robustness against both fickle and obstinate adversarial examples.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [7.382798671722412, 7.218418598175049], "id": 991, "title": "When Can Transformers Ground and Compose: Insights from Compositional Generalization Benchmarks", "authors": "Ankur Sikarwar, Arkil Patel and Navin Goyal", "abstract": "Humans can reason compositionally whilst grounding language utterances to the real world. Recent benchmarks like ReaSCAN (Wu et al., 2021) use navigation tasks grounded in a grid world to assess whether neural models exhibit similar capabilities. In this work, we present a simple transformer-based model that outperforms specialized architectures on ReaSCAN and a modified version (Qiu et al., 2021) of gSCAN (Ruis et al., 2020). On analyzing the task, we find that identifying the target location in the grid world is the main challenge for the models. Furthermore, we show that a particular split in ReaSCAN, which tests depth generalization, is unfair. On an amended version of this split, we show that transformers can generalize to deeper input structures. Finally, we design a simpler grounded compositional generalization task, RefEx, to investigate how transformers reason compositionally. We show that a single self-attention layer with a single head generalizes to novel combinations of object attributes. Moreover, we derive a precise mathematical construction of the transformer's computations from the learned network. Overall, we provide valuable insights about the grounded compositional generalization task and the behaviour of transformers on it, which would be useful for researchers working in this area.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [3.014382839202881, 4.7776079177856445], "id": 1002, "title": "Generative Language Models for Paragraph-Level Question Generation", "authors": "Asahi Ushio, Fernando Alva-Manchego and Jose Camacho-Collados", "abstract": "Powerful generative models have led to recent progress in question generation (QG). However, it is difficult to measure advances in QG research since there are no standardized resources that allow a uniform comparison among approaches. In this paper, we introduce QG-Bench, a multilingual and multidomain benchmark for QG that unifies existing question answering datasets by converting them to a standard QG setting. It includes general-purpose datasets such as SQuAD for English, datasets from ten domains and two styles, as well as datasets in eight different languages. Using QG-Bench as a reference, we perform an extensive analysis of the capabilities of language models for the task. First, we propose robust QG baselines based on fine-tuning generative language models. Then, we complement automatic evaluation based on standard metrics with an extensive manual evaluation, which in turn sheds light on the difficulty of evaluating QG models. Finally, we analyse both the domain adaptability of these models as well as the effectiveness of multilingual models in languages other than English.\nQG-Bench is released along with the fine-tuned models presented in the paper (https://github.com/asahi417/lm-question-generation), which are also available as a demo (https://autoqg.net/).", "track": "Resources and Evaluation", "label": 1}, {"loc": [6.033627510070801, 8.440853118896484], "id": 1008, "title": "A Unified Encoder-Decoder Framework with Entity Memory", "authors": "Zhihan Zhang, Wenhao Yu, Chenguang Zhu and Meng Jiang", "abstract": "Entities, as important carriers of real-world knowledge, play a key role in many NLP tasks.\nWe focus on incorporating entity knowledge into an encoder-decoder framework for informative text generation. Existing approaches tried to index, retrieve, and read external documents as evidence, but they suffered from a large computational overhead. In this work, we propose an encoder-decoder framework with an entity memory, namely EDMem. The entity knowledge is stored in the memory as latent representations, and the memory is pre-trained on Wikipedia along with encoder-decoder parameters. To precisely generate entity names, we design three decoding methods to constrain entity generation by linking entities in the memory. EDMem is a unified framework that can be used on various entity-intensive question answering and generation tasks. Extensive experimental results show that EDMem outperforms both memory-based auto-encoder models and non-memory encoder-decoder models.", "track": "Natural Language Generation", "label": 6}, {"loc": [7.889179706573486, 3.405285596847534], "id": 1010, "title": "Segmenting Numerical Substitution Ciphers", "authors": "Nada Aldarrab and Jonathan May", "abstract": "Deciphering historical substitution ciphers is a challenging problem. Example problems that have been previously studied include detecting cipher type, detecting plaintext language, and acquiring the substitution key for segmented ciphers. However, attacking unsegmented ciphers is still a challenging task. Segmentation (i.e. finding substitution units) is essential for cracking those ciphers. In this work, we propose the first automatic methods to segment those ciphers using Byte Pair Encoding (BPE) and unigram language models. Our methods achieve an average segmentation error of 2\\% on 100 randomly-generated monoalphabetic ciphers and 27\\% on 3 real historical homophonic ciphers. We also propose a method for solving non-deterministic ciphers with existing keys using a lattice and a pretrained language model. Our method leads to the full solution of the IA cipher; a real historical cipher that has not been fully solved until this work.", "track": "NLP Applications", "label": 0}, {"loc": [6.120254039764404, 12.40794563293457], "id": 1034, "title": "Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset", "authors": "Ashish V. Thapliyal, Jordi Pont Tuset, Xi Chen and Radu Soricut", "abstract": "Research in massively multilingual image captioning has been severely hampered by a lack of high-quality evaluation datasets. In this paper we present the Crossmodal-3600 dataset (XM3600 in short), a geographically diverse set of 3600 images annotated with human-generated reference captions in 36 languages. The images were selected from across the world, covering regions where the 36 languages are spoken, and annotated with captions that achieve consistency in terms of style across all languages, while avoiding annotation artifacts due to direct translation. We apply this benchmark to model selection for massively multilingual image captioning models, and show superior correlation results with human evaluations when using XM3600 as golden references for automatic metrics.", "track": "Multilinguality", "label": 13}, {"loc": [0.6602703332901001, 7.840590953826904], "id": 1035, "title": "ReSel: N-ary Relation Extraction from Scientific Text and Tables by Learning to Retrieve and Select", "authors": "Yuchen Zhuang, Yinghao Li, Junyang Zhang, Yue Yu, Yingjun Mou, Xiang Chen, Le Song and Chao Zhang", "abstract": "We study the problem of extracting N-ary relation tuples from scientific articles. This task is challenging because the target knowledge tuples can reside in multiple parts and modalities of the document. Our proposed method ReSel decomposes this task into a two-stage procedure that first retrieves the most relevant paragraph/table and then selects the target entity from the retrieved component. For the high-level retrieval stage, ReSel designs a simple and effective feature set, which captures multi-level lexical and semantic similarities between the query and components. For the low-level selection stage, ReSel designs a cross-modal entity correlation graph along with a multi-view architecture, which models both semantic and document-structural relations between entities. Our experiments on three scientific information extraction datasets show that ReSel outperforms state-of-the-art baselines significantly.", "track": "Information Extraction", "label": 5}, {"loc": [0.5032818913459778, 7.0611138343811035], "id": 1079, "title": "GammaE: Gamma Embeddings for Logical Queries on Knowledge Graphs", "authors": "Dong Yang, QING PEIJUN, Yang Li, Haonan Lu and xiaodong lin", "abstract": "Embedding knowledge graphs (KGs) for multi-hop logical reasoning is a challenging problem due to massive and complicated structures in many KGs. Recently, many promising works projected entities and queries into a geometric space to efficiently find answers. However, it remains challenging to model the negation and union operator. The negation operator has no strict boundaries, which generates overlapped embeddings and leads to obtaining ambiguous answers. An additional limitation is that the union operator is non-closure, which undermines the model to handle a series of union operators. To address these problems, we propose a novel probabilistic embedding model, namely Gamma Embeddings (GammaE), for encoding entities and queries to answer different types of FOL queries on KGs. We utilize the linear property and strong boundary support of the Gamma distribution to capture more features of entities and queries, which dramatically reduces model uncertainty. Furthermore, GammaE implements the Gamma mixture method to design the closed union operator. The performance of GammaE is validated on three large logical query datasets. Experimental results show that GammaE significantly outperforms state-of-the-art models on public benchmarks.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.091821193695068, 4.093278884887695], "id": 1110, "title": "Reasoning Like Program Executors", "authors": "Xinyu Pi, Qian Liu, Bei Chen, Morteza Ziyadi, Zeqi Lin, Qiang Fu, Yan Gao, Jian-Guang LOU and Weizhu Chen", "abstract": "Reasoning over natural language is a long-standing goal for the research community. However, studies have shown that existing language models are inadequate in reasoning. To address the issue, we present POET, a novel reasoning pre-training paradigm. Through pre-training language models with programs and their execution results, POET empowers language models to harvest the reasoning knowledge possessed by program executors via a data-driven approach. POET is conceptually simple and can be instantiated by different kinds of program executors. In this paper, we showcase two simple instances POET-Math and POET-Logic, in addition to a complex instance, POET-SQL. Experimental results on six benchmarks demonstrate that POET can significantly boost model performance in natural language reasoning, such as numerical reasoning, logical reasoning, and multi-hop reasoning. POET opens a new gate on reasoning-enhancement pre-training, and we hope our analysis would shed light on the future research of reasoning like program executors.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [3.984764337539673, 9.305280685424805], "id": 1113, "title": "SEM-F1: an Automatic Way for Semantic Evaluation of Multi-Narrative Overlap Summaries at Scale", "authors": "Naman Bansal, Mousumi Akter and Shubhra Kanti Karmaker", "abstract": "Recent work has introduced an important yet relatively under-explored NLP task called Semantic Overlap Summarization (SOS) that entails generating a summary from multiple alternative narratives which conveys the common information provided by those narratives. Previous work also published a benchmark dataset for this task by collecting 2,925 alternative narrative pairs from the web and manually annotating 411 different reference summaries by engaging human annotators. In this paper, we exclusively focus on the automated evaluation of the SOS task using the benchmark dataset. More specifically, we first use the popular ROUGE metric from text-summarization literature and conduct a systematic study to evaluate the SOS task. Our experiments discover that ROUGE is not suitable for this novel task and therefore, we propose a new sentence-level precision-recall style automated evaluation metric, called SEM-F1 (Semantic F1). It is inspired by the benefits of the sentence-wise annotation technique using overlap labels reported by the previous work. Our experiments show that the proposed SEM-F1 metric yields a higher correlation with human judgment and higher inter-rater agreement compared to the ROUGE metric.", "track": "Summarization", "label": 14}, {"loc": [8.33066177368164, 8.465265274047852], "id": 1116, "title": "Inducer-tuning: Connecting Prefix-tuning and Adapter-tuning", "authors": "Yifan Chen, Devamanyu Hazarika, Mahdi Namazifar, Yang Liu, Di Jin and Dilek Hakkani-Tur", "abstract": "Prefix-tuning, or more generally continuous prompt tuning, has become an essential paradigm of parameter-efficient transfer learning. Using a large pre-trained language model (PLM), prefix-tuning can obtain strong performance by training only a small portion of parameters. In this paper, we propose to understand and further develop prefix-tuning through the kernel lens. Specifically, we make an analogy between \\textit{prefixes} and \\textit{inducing variables} in kernel methods and hypothesize that \\textit{prefixes} serving as \\textit{inducing variables} would improve their overall mechanism. From the kernel estimator perspective, we suggest a new variant of prefix-tuning---\\textit{inducer-tuning}, which shares the exact mechanism as prefix-tuning while leveraging the residual form found in adapter-tuning. This mitigates the initialization issue in prefix-tuning. Through comprehensive empirical experiments on natural language understanding and generation tasks, we demonstrate that inducer-tuning can close the performance gap between prefix-tuning and fine-tuning.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.771486759185791, 4.629601955413818], "id": 1128, "title": "DocInfer: Document-level Natural Language Inference using Optimal Evidence Selection", "authors": "Puneet Mathur, Gautam Kunapuli, Riyaz Ahmad Bhat, Manish Shrivastava, Dinesh Manocha and Maneesh Singh", "abstract": "We present DocInfer - a novel, end-to-end Document-level Natural Language Inference model that builds a hierarchical document graph enriched through inter-sentence relations (topical, entity-based, concept-based), performs paragraph pruning using the novel SubGraph Pooling layer, followed by optimal evidence selection based on REINFORCE algorithm to identify the most important context sentences for a given hypothesis. Our evidence selection mechanism allows it to transcend the input length limitation of modern BERT-like Transformer models while presenting the entire evidence together for inferential reasoning. We show this is an important property needed to reason on large documents where the evidence may be fragmented and located arbitrarily far from each other. Extensive experiments on popular corpora - DocNLI, ContractNLI, and ConTRoL datasets, and our new proposed dataset called CaseHoldNLI on the task of legal judicial reasoning, demonstrate significant performance gains of 8-12% over SOTA methods. Our ablation studies validate the impact of our model. Performance improvement of 3-6% on annotation-scarce downstream tasks of fact verification, multiple-choice QA, and contract clause retrieval demonstrates the usefulness of DocInfer beyond primary NLI tasks.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [0.44673553109169006, 7.152421951293945], "id": 1135, "title": "LightEA: A Scalable, Robust, and Interpretable Entity Alignment Framework via Three-view Label Propagation", "authors": "Xin Mao, wenting wang, Yuanbin Wu and Man Lan", "abstract": "Entity Alignment (EA) aims to find equivalent entity pairs between KGs, which is the core step to bridging and integrating multi-source KGs. In this paper, we argue that existing complex EA methods inevitably inherit the inborn defects from their neural network lineage: poor interpretability and weak scalability. Inspired by recent studies, we reinvent the classical Label Propagation algorithm to effectively run on KGs and propose a neural-free EA framework \u2014 LightEA, consisting of three efficient components: (i) Random Orthogonal Label Generation, (ii) Three-view Label Propagation, and (iii) Sparse Sinkhorn Operation.\nAccording to the extensive experiments on public datasets, LightEA has impressive scalability, robustness, and interpretability. With a mere tenth of time consumption, LightEA achieves comparable results to state-of-the-art methods across all datasets and even surpasses them on many. Besides, due to the computational process of LightEA being entirely linear, we could trace the propagation process at each step and clearly explain how the entities are aligned.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [4.757153511047363, 3.2815983295440674], "id": 1164, "title": "Metric-guided Distillation: Distilling Knowledge from the Metric to Ranker and Retriever for Generative Commonsense Reasoning", "authors": "Xingwei He, Yeyun Gong, A-Long Jin, Weizhen Qi, Hang Zhang, Jian Jiao, Bartuer Zhou, Biao Cheng, SM Yiu and Nan Duan", "abstract": "Commonsense generation aims to generate a realistic sentence describing a daily scene under the given concepts, which is very challenging, since it requires models to have relational reasoning and compositional generalization capabilities. Previous work focuses on retrieving prototype sentences for the provided concepts to assist generation. They first use a sparse retriever to retrieve candidate sentences, then re-rank the candidates with a ranker. However, the candidates returned by their ranker may not be the most relevant sentences, since the ranker treats all candidates equally without considering their relevance to the reference sentences of the given concepts. Another problem is that re-ranking is very expensive, but only using retrievers will seriously degrade the performance of their generation models. To solve these problems, we propose the metric distillation rule to distill knowledge from the metric (e.g., BLEU) to the ranker. We further transfer the critical knowledge summarized by the distilled ranker to the retriever. In this way, the relevance scores of candidate sentences predicted by the ranker and retriever will be more consistent with their quality measured by the metric. Experimental results on the CommonGen benchmark verify the effectiveness of our proposed method: (1) Our generation model with the distilled ranker achieves a new state-of-the-art result. (2) Our generation model with the distilled retriever even surpasses the previous SOTA.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [1.7093737125396729, 3.79512095451355], "id": 1186, "title": "Efficient Document Retrieval by End-to-End Refining and Quantizing BERT Embedding with Contrastive Product Quantization", "authors": "Zexuan Qiu, Qinliang Su, Jianxing Yu and Shijing Si", "abstract": "Efficient document retrieval heavily relies on the technique of semantic hashing, which learns a binary code for every document and employs Hamming distance to evaluate document distances. However, existing semantic hashing methods are mostly established on outdated TFIDF features, which obviously do not contain lots of important semantic information about documents. Furthermore, the Hamming distance can only be equal to one of several integer values, significantly limiting its representational ability for document distances. To address these issues, in this paper, we propose to leverage BERT embeddings to perform efficient retrieval based on the product quantization technique, which will assign for every document a real-valued codeword from the codebook, instead of a binary code as in semantic hashing. Specifically, we first transform the original BERT embeddings via a learnable mapping and feed the transformed embedding into a probabilistic product quantization module to output the assigned codeword. The refining and quantizing modules can be optimized in an end-to-end manner by minimizing the probabilistic contrastive loss. A mutual information maximization based method is further proposed to improve the representativeness of codewords, so that documents can be quantized more accurately. Extensive experiments conducted on three benchmarks demonstrate that our proposed method significantly outperforms current state-of-the-art baselines.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [1.0890947580337524, 10.554510116577148], "id": 1226, "title": "Curriculum Knowledge Distillation for Emoji-supervised Cross-lingual Sentiment Analysis", "authors": "Jianyang Zhang, Tao Liang, Mingyang Wan, Guowu Yang and Fengmao Lv", "abstract": "Existing sentiment analysis models have achieved great advances with the help of sufficient sentiment annotations. Unfortunately, many languages do not have sufficient sentiment corpus. To this end, recent studies have proposed cross-lingual sentiment analysis to transfer sentiment analysis models from resource-rich languages to low-resource languages. However, these studies either rely on external cross-lingual supervision (e.g., parallel corpora and translation model), or are limited by the cross-lingual gaps. In this work, based on the intuitive assumption that the relationships between emojis and sentiments are consistent across different languages, we investigate transferring sentiment knowledge across languages with the help of emojis. To this end, we propose a novel cross-lingual sentiment analysis approach dubbed Curriculum Knowledge Distiller (CKD). The core idea of CKD is to use emojis to bridge the source and target languages. Note that, compared with texts, emojis are more transferable, but cannot reveal the precise sentiment. Thus, we distill multiple Intermediate Sentiment Classifiers (ISC) on source language corpus with emojis to get ISCs with different attention weights of texts. To transfer them into the target language, we distill ISCs into the Target Language Sentiment Classifier (TSC) following the curriculum learning mechanism. In this way, TSC can learn delicate sentiment knowledge, meanwhile, avoid being affected by cross-lingual gaps. Experimental results on five cross-lingual benchmarks clearly verify the effectiveness of our approach.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [4.446403503417969, 7.6140642166137695], "id": 1227, "title": "Correctable-DST: Mitigating Historical Context Mismatch between Training and Inference for Improved Dialogue State Tracking", "authors": "Hongyan Xie, Haoxiang Su, Shuangyong Song, Hao Huang, Bo Zou, Kun Deng, Jianghua Lin, Zhihui Zhang and Xiaodong He", "abstract": "Recently proposed dialogue state tracking (DST) approaches predict the dialogue state of a target turn sequentially based on the previous dialogue state. During the training time, the ground-truth previous dialogue state is utilized as the historical context. However, only the previously predicted dialogue state can be used in inference. This discrepancy might lead to error propagation, i.e., mistakes made by the model in the current turn are likely to be carried over to the following turns.\nTo solve this problem, we propose Correctable Dialogue State Tracking (Correctable-DST). Specifically, it consists of three stages: (1) a Predictive State Simulator is exploited to generate a previously \"predicted\" dialogue state based on the ground-truth previous dialogue state during training; (2) a Slot Detector is proposed to determine the slots with an incorrect value in the previously \"predicted\" state and the slots whose values are to be updated in the current turn; (3) a State Generator takes the name of the above-selected slots as a prompt to generate the current state.\nEmpirical results show that our approach achieves 67.51%, 68.24%, 70.30%, 71.38%, and 81.27% joint goal accuracy on MultiWOZ 2.0-2.4 datasets, respectively, and achieves a new state-of-the-art performance with significant improvements.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [7.472158908843994, 9.513226509094238], "id": 1233, "title": "DropMix: A Textual Data Augmentation Combining Dropout with Mixup", "authors": "Fanshuang Kong, Richong Zhang, Xiaohui Guo, Samuel Mensah and Yongyi Mao", "abstract": "Overfitting is a notorious problem when there is insufficient data to train deep neural networks in machine learning tasks. Data augmentation regularization methods such as Dropout, Mixup, and their enhanced variants are effective and prevalent, and achieve promising performance to overcome overfitting. However, in text learning, most of the existing regularization approaches merely adopt ideas from computer vision without considering the importance of dimensionality in natural language processing. In this paper, we argue that the property is essential to overcome overfitting in text learning. Accordingly, we present a saliency map informed textual data augmentation and regularization framework, which combines Dropout and Mixup, namely DropMix, to mitigate the overfitting problem in text learning. In addition, we design a procedure that drops and patches fine grained shapes of the saliency map under the DropMix framework to enhance regularization. Empirical studies confirm the effectiveness of the proposed approach on 12 text classification tasks.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [2.337517023086548, 7.8153228759765625], "id": 1279, "title": "Cross-document Event Coreference Search: Task, Dataset and Modeling", "authors": "Alon Eirew, Avi Caciularu and Ido Dagan", "abstract": "The task of Cross-document Coreference Resolution has been traditionally formulated as requiring to identify all coreference links across a given set of documents. We propose an appealing, and often more applicable, complementary set up for the task -- Cross-document Coreference Search, focusing in this paper on event coreference. Concretely, given a mention in context of an event of interest, considered as a query, the task is to find all coreferring mentions for the query event in a large document collection. To support research on this task, we create a corresponding dataset, which is derived from Wikipedia while leveraging annotations in the available Wikipedia Event Coreferecene dataset (WEC-Eng). Observing that the coreference search setup is largely analogous to the setting of Open Domain Question Answering, we adapt the prominent Deep Passage Retrieval (DPR) model to our setting, as an appealing baseline. Finally, we present a novel model that integrates a powerful coreference scoring scheme into the DPR architecture, yielding improved performance.", "track": "Resources and Evaluation", "label": 1}, {"loc": [7.520313262939453, 6.754205226898193], "id": 1320, "title": "VIRT: Improving Representation-based Text Matching via Virtual Interaction", "authors": "Dan Li, Yang Yang, Hongyin Tang, Jiahao Liu, Qifan Wang, Jingang Wang, Tong Xu, Wei Wu and Enhong Chen", "abstract": "Text matching is a fundamental research problem in natural language understanding. Interaction-based approaches treat the text pair as a single sequence and encode it through cross encoders, while representation-based models encode the text pair independently with siamese or dual encoders. Interaction-based models require dense computations and thus are impractical in real-world applications. Representation-based models have become the mainstream paradigm for efficient text matching. However, these models suffer from severe performance degradation due to the lack of interactions between the pair of texts. To remedy this, we propose a Virtual InteRacTion mechanism (VIRT) for improving representation-based text matching while maintaining its efficiency. In particular, we introduce an interactive knowledge distillation module that is only applied during training. It enables deep interaction between texts by effectively transferring knowledge from the interaction-based model. A light interaction strategy is designed to fully leverage the learned interactive knowledge. Experimental results on six text matching benchmarks demonstrate the superior performance of our method over several state-of-the-art representation-based models. We further show that VIRT can be integrated into existing methods as plugins to lift their performances.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [2.145726442337036, 7.63754940032959], "id": 1343, "title": "MAVEN-ERE: A Unified Large-scale Dataset for Event Coreference, Temporal, Causal, and Subevent Relation Extraction", "authors": "Xiaozhi Wang, Yulin Chen, Ning Ding, Hao Peng, Zimu Wang, Yankai Lin, Xu Han, Lei Hou, Juanzi Li, Zhiyuan Liu, Peng Li and Jie Zhou", "abstract": "The diverse relationships among real-world events, including coreference, temporal, causal, and subevent relations, are fundamental to understanding natural languages. However, two drawbacks of existing datasets limit event relation extraction (ERE) tasks: (1) Small scale. Due to the annotation complexity, the data scale of existing datasets is limited, which cannot well train and evaluate data-hungry models. (2) Absence of unified annotation. Different types of event relations naturally interact with each other, but existing datasets only cover limited relation types at once, which prevents models from taking full advantage of relation interactions. To address these issues, we construct a unified large-scale human-annotated ERE dataset MAVEN-ERE with improved annotation schemes. It contains 103,193 event coreference chains, 1,216,217 temporal relations, 57,992 causal relations, and 15,841 subevent relations, which is larger than existing datasets of all the ERE tasks by at least an order of magnitude. Experiments show that ERE on MAVEN-ERE is quite challenging, and considering relation interactions with joint learning can improve performances. The dataset and source codes can be obtained from https://github.com/THU-KEG/MAVEN-ERE.", "track": "Information Extraction", "label": 5}, {"loc": [1.7272932529449463, 9.077872276306152], "id": 1344, "title": "Entity Extraction in Low Resource Domains with Selective Pre-training of Large Language Models", "authors": "Aniruddha Mahapatra, Sharmila Reddy Nangi, Aparna Garimella and Anandhavelu N", "abstract": "Transformer-based language models trained on large natural language corpora have been very useful in downstream entity extraction tasks. However, they often result in poor performances when applied to domains that are different from those they are pretrained on. Continued pretraining using unlabeled data from target domains can help improve the performances of these language models on the downstream tasks. However, using all of the available unlabeled data for pretraining can be time-intensive; also, it can be detrimental to the performance of the downstream tasks, if the unlabeled data is not aligned with the data distribution for the target tasks. Previous works employed external supervision in the form of ontologies for selecting appropriate data samples for pretraining, but external supervision can be quite hard to obtain in low-resource domains. In this paper, we introduce effective ways to select data from unlabeled corpora of target domains for language model pretraining to improve the performances in target entity extraction tasks. Our data selection strategies do not require any external supervision. We conduct extensive experiments for the task of named entity recognition (NER) on seven different domains and show that language models pretrained on target domain unlabeled data obtained using our data selection strategies achieve better performances compared to those using data selection strategies in previous works that use external supervision. We also show that these pretrained language models using our data selection strategies outperform those pretrained on all of the available unlabeled target domain data.", "track": "Information Extraction", "label": 5}, {"loc": [8.030102729797363, 5.1703104972839355], "id": 1347, "title": "How Large Language Models are Transforming Machine-Paraphrase Plagiarism", "authors": "Jan Philip Wahle, Terry Ruas, Frederic Kirstein and Bela Gipp", "abstract": "The recent success of large language models for text generation poses a severe threat to academic integrity, as plagiarists can generate realistic paraphrases indistinguishable from original work.\nHowever, the role of large autoregressive models in generating machine-paraphrased plagiarism and their detection is still incipient in the literature.\nThis work explores T5 and GPT3 for machine-paraphrase generation on scientific articles from arXiv, student theses, and Wikipedia.\nWe evaluate the detection performance of six automated solutions and one commercial plagiarism detection software and perform a human study with 105 participants regarding their detection performance and the quality of generated examples.\nOur results suggest that large language models can rewrite text humans have difficulty identifying as machine-paraphrased (53% mean acc.).\nHuman experts rate the quality of paraphrases generated by GPT-3 as high as original texts (clarity 4.0/5, fluency 4.2/5, coherence 3.8/5).\nThe best-performing detection model (GPT-3) achieves 66% F1-score in detecting paraphrases.\nWe make our code, data, and findings publicly available to facilitate the development of detection solutions.", "track": "NLP Applications", "label": 0}, {"loc": [8.093953132629395, 8.708601951599121], "id": 1348, "title": "M2D2: A Massively Multi-Domain Language Modeling Dataset", "authors": "Machel Reid, Victor Zhong, Suchin Gururangan and Luke Zettlemoyer", "abstract": "We present M2D2, a fine-grained, massively multi-domain corpus for studying domain adaptation in language models (LMs). M2D2 consists of 8.5B tokens and spans 145 domains extracted from Wikipedia and Semantic Scholar. Using ontologies derived from Wikipedia and ArXiv categories, we organize the domains in each data source into 22 groups. This two-level hierarchy enables the study of relationships between domains and their effects on in- and out-of-domain performance after adaptation. We also present a number of insights into the nature of effective domain adaptation in LMs, as examples of the new types of studies M2D2 enables. To improve in-domain performance, we show the benefits of adapting the LM along a domain hierarchy; adapting to smaller amounts of fine-grained domain-specific data can lead to larger in-domain performance gains than larger amounts of weakly relevant data. We further demonstrate a trade-off between in-domain specialization and out-of-domain generalization within and across ontologies, as well as a strong correlation between out-of-domain performance and lexical overlap between domains.", "track": "Resources and Evaluation", "label": 1}, {"loc": [5.498634338378906, 5.076426029205322], "id": 1369, "title": "\"Will You Find These Shortcuts?\" A Protocol for Evaluating the Faithfulness of Input Salience Methods for Text Classification", "authors": "Jasmijn Bastings, Sebastian Ebert, Polina Zablotskaia, Anders Sandholm and Katja Filippova", "abstract": "Feature attribution a.k.a. input salience methods which assign an importance score to a feature are abundant but may produce surprisingly different results for the same model on the same input. While differences are expected if disparate definitions of importance are assumed, most methods claim to provide faithful attributions and point at the features most relevant for a model's prediction. Existing work on faithfulness evaluation is not conclusive and does not provide a clear answer as to how different methods are to be compared.\nFocusing on text classification and the model debugging scenario, our main contribution is a protocol for faithfulness evaluation that makes use of partially synthetic data to obtain ground truth for feature importance ranking. Following the protocol, we do an in-depth analysis of four standard salience method classes on a range of datasets and lexical shortcuts for BERT and LSTM models. We demonstrate that some of the most popular method configurations provide poor results even for simple shortcuts while a method judged to be too simplistic works remarkably well for BERT.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [10.334426879882812, 7.665778636932373], "id": 1410, "title": "Information-Transport-based Policy for Simultaneous Translation", "authors": "Shaolei Zhang and Yang Feng", "abstract": "Simultaneous translation (ST) outputs translation while receiving the source inputs, and hence requires a policy to determine whether to translate a target token or wait for the next source token. The major challenge of ST is that each target token can only be translated based on the current received source tokens, where the received source information will directly affect the translation quality. So naturally, how much source information is received for the translation of the current target token is supposed to be the pivotal evidence for the ST policy to decide between translating and waiting. In this paper, we treat the translation as information transport from source to target and accordingly propose an Information-Transport-based Simultaneous Translation (ITST). ITST quantifies the transported information weight from each source token to the current target token, and then decides whether to translate the target token according to its accumulated received information. Experiments on both text-to-text ST and speech-to-text ST (a.k.a., streaming speech translation) tasks show that ITST outperforms strong baselines and achieves state-of-the-art performance.", "track": "Machine Translation", "label": 10}, {"loc": [8.135863304138184, 5.209171295166016], "id": 1416, "title": "Learning to Adapt to Low-Resource Paraphrase Generation", "authors": "Zhigen Li, Yanmeng Wang, Rizhao Fan, Ye Wang, Jianfeng Li and Shaojun Wang", "abstract": "Paraphrase generation is a longstanding NLP task and achieves great success with the aid of large corpora. However, transferring a paraphrasing model to another domain encounters the problem of domain shifting especially when the data is sparse. At the same time, widely using large pre-trained language models (PLMs) faces the overfitting problem when training on scarce labeled data. To mitigate these two issues, we propose, LAPA, an effective adapter for PLMs optimized by meta-learning. LAPA has three-stage training on three types of related resources to solve this problem: 1. pre-training PLMs on unsupervised corpora, 2. inserting an adapter layer and meta-training on source domain labeled data, and 3. fine-tuning adapters on a small amount of target domain labeled data. This method enables paraphrase generation models to learn basic language knowledge first, then learn the paraphrasing task itself later, and finally adapt to the target task. Our experimental results demonstrate that LAPA achieves state-of-the-art in supervised, unsupervised, and low-resource settings on three benchmark datasets. \nWith only 2% of trainable parameters and 1% labeled data of the target task, our approach can achieve a competitive performance with previous work.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [5.886528015136719, 8.78001594543457], "id": 1435, "title": "A Distributional Lens for Multi-Aspect Controllable Text Generation", "authors": "Yuxuan Gu, Xiaocheng Feng, Sicheng Ma, Lingyuan Zhang, Heng Gong and Bing Qin", "abstract": "Multi-aspect controllable text generation is a more challenging and practical task than single-aspect control. Existing methods achieve complex multi-aspect control by fusing multiple controllers learned from single-aspect, but suffer from attribute degeneration caused by the mutual interference of these controllers. To address this, we provide observations on attribute fusion from a distributional perspective and propose to directly search for the intersection areas of multiple attribute distributions as their combination for generation. Our method first estimates the attribute space with an autoencoder structure. Afterward, we iteratively approach the intersections by jointly minimizing distances to points representing different attributes. Finally, we map them to attribute-relevant sentences with a prefix-tuning-based decoder. Experiments on the three-aspect control task, including sentiment, topic, and detoxification aspects, reveal that our method outperforms several strong baselines on attribute relevance and text quality and achieves the SOTA. Further analysis also supplies some explanatory support for the effectiveness of our approach.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.816460609436035, 8.775479316711426], "id": 1478, "title": "ELMER: A Non-Autoregressive Pre-trained Language Model for Efficient and Effective Text Generation", "authors": "Junyi Li, Tianyi Tang, Wayne Xin Zhao, Jian-Yun Nie and Ji-Rong Wen", "abstract": "We study the text generation task under the approach of pre-trained language models (PLMs). Typically, an auto-regressive (AR) method is adopted for generating texts in a token-by-token manner. \nDespite many advantages of AR generation, it usually suffers from inefficient inference. Therefore, non-autoregressive (NAR) models are proposed to generate all target tokens simultaneously. However, NAR models usually generate texts of lower quality due to the absence of token dependency in the output text. In this paper, we propose ELMER: an efficient and effective PLM for NAR text generation to explicitly model the token dependency during NAR generation. By leveraging the early exit technique, ELMER enables the token generations at different layers, according to their prediction confidence (a more confident token will exit at a lower layer). Besides, we propose a novel pre-training objective, Layer Permutation Language Modeling, to pre-train ELMER by permuting the exit layer for each token in sequences. Experiments on three text generation tasks show that ELMER significantly outperforms NAR models and further narrows the performance gap with AR PLMs (\\eg ELMER (29.92) vs BART (30.61) ROUGE-L in XSUM) while achieving over 10 times inference speedup.", "track": "Natural Language Generation", "label": 6}, {"loc": [0.9178818464279175, 8.154624938964844], "id": 1479, "title": "Multilingual Relation Classification via Efficient and Effective Prompting", "authors": "Yuxuan Chen, David Harbecke and Leonhard Hennig", "abstract": "Prompting pre-trained language models has achieved impressive performance on various NLP tasks, especially in low data regimes. Despite the success of prompting in monolingual settings, applying prompt-based methods in multilingual scenarios has been limited to a narrow set of tasks, due to the high cost of handcrafting multilingual prompts. In this paper, we present the first work on prompt-based multilingual relation classification (RC), by introducing an efficient and effective method that constructs prompts from relation triples and involves only minimal translation for the class labels. We evaluate its performance in fully supervised, few-shot and zero-shot scenarios, and analyze its effectiveness across 14 languages, prompt variants, and English-task training in cross-lingual settings. We find that in both fully supervised and few-shot scenarios, our prompt method beats competitive baselines: fine-tuning XLM-R_EM and null prompts. It also outperforms the random baseline by a large margin in zero-shot experiments. Our method requires little in-language knowledge and can be used as a strong baseline for similar multilingual classification tasks.", "track": "Information Extraction", "label": 5}, {"loc": [7.071919918060303, 4.849345684051514], "id": 1536, "title": "Topic-Regularized Authorship Representation Learning", "authors": "Jitkapat Sawatphol, Nonthakit Chaiwong, Can Udomcharoenchaikit and Sarana Nutanong", "abstract": "Authorship attribution is a task that aims to identify the author of a given piece of writing. We aim to develop a generalized solution that can handle a large number of texts from authors and topics unavailable in training data. Previous studies have proposed strategies to address only either unseen authors or unseen topics. Authorship representation learning has been shown to work in open-set environments with a large number of unseen authors but has not been explicitly designed for cross-topic environments at the same time. To handle a large number of unseen authors and topics, we propose Authorship Representation Regularization (ARR), a distillation framework that creates authorship representation with reduced reliance on topic-specific information. To assess the performance of our framework, we also propose a cross-topic-open-set evaluation method. Our proposed method has improved performances in the cross-topic-open set setup over baselines in 4 out of 6 cases.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [0.7960485816001892, 8.063212394714355], "id": 1573, "title": "Fine-grained Contrastive Learning for Relation Extraction", "authors": "William Hogan, Jiacheng Li and Jingbo Shang", "abstract": "Recent relation extraction (RE) works have shown encouraging improvements by conducting contrastive learning on silver labels generated by distant supervision before fine-tuning on gold labels. Existing methods typically assume all these silver labels are accurate and treat them equally; however, distant supervision is inevitably noisy\u2013some silver labels are more reliable than others. In this paper, we propose fine-grained contrastive learning (FineCL) for RE, which leverages fine-grained information about which silver labels are and are not noisy to improve the quality of learned relationship representations for RE. We first assess the quality of silver labels via a simple and automatic approach we call \"learning order denoising,\" where we train a language model to learn these relations and record the order of learned training instances. We show that learning order largely corresponds to label accuracy\u2013early-learned silver labels have, on average, more accurate labels than later-learned silver labels. Then, during pre-training, we increase the weights of accurate labels within a novel contrastive learning objective. Experiments on several RE benchmarks show that FineCL makes consistent and significant performance gains over state-of-the-art methods.", "track": "Information Extraction", "label": 5}, {"loc": [3.661283254623413, 8.038297653198242], "id": 1590, "title": "Curriculum Prompt Learning with Self-Training for Abstractive Dialogue Summarization", "authors": "Changqun Li, Linlin Wang, xin Lin, Gerard de Melo and Liang He", "abstract": "Succinctly summarizing dialogue is a task of growing interest, but inherent challenges, such as insufficient training data and low information density impede our ability to train abstractive models. In this work, we propose a novel curriculum-based prompt learning method with self-training to address these problems. Specifically, prompts are learned using a curriculum learning strategy that gradually increases the degree of prompt perturbation, thereby improving the dialogue understanding and modeling capabilities of our model. Unlabeled dialogue is incorporated by means of self-training so as to reduce the dependency on labeled data. We further investigate topic-aware prompts to better plan for the generation of summaries. Experiments confirm that our model substantially outperforms strong baselines and achieves new state-of-the-art results on the AMI and ICSI datasets. Human evaluations also show the superiority of our model with regard to the summary generation quality.", "track": "Natural Language Generation", "label": 6}, {"loc": [8.017522811889648, 9.721717834472656], "id": 1635, "title": "Zero-Shot Text Classification with Self-Training", "authors": "Ariel Gera, Alon Halfon, Eyal Shnarch, Yotam Perlitz, Liat Ein-Dor and Noam Slonim", "abstract": "Recent advances in large pretrained language models have increased attention to zero-shot text classification. In particular, models finetuned on natural language inference datasets have been widely adopted as zero-shot classifiers due to their promising results and off-the-shelf availability. However, the fact that such models are unfamiliar with the target task can lead to instability and performance issues. We propose a plug-and-play method to bridge this gap using a simple self-training approach, requiring only the class names along with an unlabeled dataset, and without the need for domain expertise or trial and error. We show that fine-tuning the zero-shot classifier on its most confident predictions leads to significant performance gains across a wide range of text classification tasks, presumably since self-training adapts the zero-shot model to the task at hand.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [4.86292028427124, 4.750800132751465], "id": 1704, "title": "Deconfounding Legal Judgment Prediction for European Court of Human Rights Cases Towards Better Alignment with Experts", "authors": "T.Y.S.S Santosh, Shanshan Xu, Oana Ichim and Matthias Grabmair", "abstract": "This work demonstrates that Legal Judgement Prediction systems without expert-informed adjustments can be vulnerable to shallow, distracting surface signals that arise from corpus construction, case distribution, and confounding factors. To mitigate this, we use domain expertise to strategically identify statistically predictive but legally irrelevant information. We adopt adversarial training to prevent the system from relying on it. We evaluate our deconfounded models by employing interpretability techniques and comparing to expert annotations. Quantitative experiments and qualitative analysis show that our deconfounded model consistently aligns better with expert rationales than baselines trained for prediction only. We further contribute a set of reference expert annotations to the validation and testing partitions of an existing benchmark dataset of European Court of Human Rights cases.", "track": "NLP Applications", "label": 0}, {"loc": [3.68505859375, 9.543838500976562], "id": 1733, "title": "SQuALITY: Building a Long-Document Summarization Dataset the Hard Way", "authors": "Alex Wang, Richard Yuanzhe Pang, Angelica Chen, Jason Phang and Samuel R. Bowman", "abstract": "Summarization datasets are often assembled either by scraping naturally occurring public-domain summaries---which are nearly always in difficult-to-work-with technical domains---or by using approximate heuristics to extract them from everyday text---which frequently yields unfaithful summaries. In this work, we turn to a slower but more straightforward approach to developing summarization benchmark data: We hire highly-qualified contractors to read stories and write original summaries from scratch. To amortize reading time, we collect five summaries per document, with the first giving an overview and the subsequent four addressing specific questions. We use this protocol to collect SQuALITY, a dataset of question-focused summaries built on the same public-domain short stories as the multiple-choice dataset QuALITY (Pang et al., 2021). Experiments with state-of-the-art summarization systems show that our dataset is challenging and that existing automatic evaluation metrics are weak indicators of quality.", "track": "Summarization", "label": 14}, {"loc": [4.5531697273254395, 7.628974437713623], "id": 1770, "title": "MetaASSIST: Robust Dialogue State Tracking with Meta Learning", "authors": "Fanghua Ye, xi wang, Jie Huang, Shenghui Li, Samuel Stern and Emine Yilmaz", "abstract": "Existing dialogue datasets contain lots of noise in their state annotations. Such noise can hurt model training and ultimately lead to poor generalization performance. A general framework named ASSIST has recently been proposed to train robust dialogue state tracking (DST) models. It introduces an auxiliary model to generate pseudo labels for the noisy training set. These pseudo labels are combined with vanilla labels by a common fixed weighting parameter to train the primary DST model. Notwithstanding the improvements of ASSIST on DST, tuning the weighting parameter is challenging. Moreover, a single parameter shared by all slots and all instances may be suboptimal. To overcome these limitations, we propose a meta learning-based framework MetaASSIST to adaptively learn the weighting parameter. Specifically, we propose three schemes with varying degrees of flexibility, ranging from slot-wise to both slot-wise and instance-wise, to convert the weighting parameter into learnable functions. These functions are trained in a meta-learning manner by taking the validation set as meta data. Experimental results demonstrate that all three schemes can achieve competitive performance. Most impressively, we achieve a state-of-the-art joint goal accuracy of 80.10% on MultiWOZ 2.4.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [9.939080238342285, 6.70028829574585], "id": 1781, "title": "Multilingual Machine Translation with Hyper-Adapters", "authors": "Christos Baziotis, Mikel Artetxe, James Cross and Shruti Bhosale", "abstract": "Multilingual machine translation suffers from negative interference across languages. A common solution is to relax parameter sharing with language-specific modules like adapters. However, adapters of related languages are unable to transfer information, and their total number of parameters becomes prohibitively expensive as the number of languages grows. In this work, we overcome these drawbacks using hyper-adapters -- hyper-networks that generate adapters from language and layer embeddings. While past work had poor results when scaling hyper-networks, we propose a rescaling fix that significantly improves convergence and enables training larger hyper-networks. We find that hyper-adapters are more parameter efficient than regular adapters, reaching the same performance with up to 12 times less parameters. When using the same number of parameters and FLOPS, our approach consistently outperforms regular adapters. Also, hyper-adapters converge faster than alternative approaches and scale better than regular dense networks. Our analysis shows that hyper-adapters learn to encode language relatedness, enabling positive transfer across languages.", "track": "Machine Translation", "label": 10}, {"loc": [5.380457401275635, 12.177384376525879], "id": 1823, "title": "Z-LaVI: Zero-Shot Language Solver Fueled by Visual Imagination", "authors": "Yue Yang, Wenlin Yao, Hongming Zhang, Xiaoyang Wang, Dong Yu and Jianshu Chen", "abstract": "Large-scale pretrained language models have made significant advances in solving downstream language understanding tasks. However, they generally suffer from reporting bias, the phenomenon describing the lack of explicit commonsense knowledge in written text, e.g., ''an orange is orange''. To overcome this limitation, we develop a novel approach, Z-LaVI, to endow language models with visual imagination capabilities. Specifically, we leverage two complementary types of ''imaginations'': (i) recalling existing images through retrieval and (ii) synthesizing nonexistent images via text-to-image generation. Jointly exploiting the language inputs and the imagination, a pretrained vision-language model (e.g., CLIP) eventually composes a zero-shot solution to the original language tasks. Notably, fueling language models with imagination can effectively leverage visual knowledge to solve plain language tasks. In consequence, Z-LaVI consistently improves the zero-shot performance of existing language models across a diverse set of language tasks.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.838489055633545, 3.3683929443359375], "id": 1833, "title": "Using Commonsense Knowledge to Answer Why-Questions", "authors": "Yash Kumar Lal, Niket Tandon, Tanvi Aggarwal, Horace Liu, Nathanael Chambers, Raymond Mooney and Niranjan Balasubramanian", "abstract": "Answering questions in narratives about {why events happened often requires commonsense knowledge external to the text. What aspects of this knowledge are available in large language models? What aspects can be made accessible via external commonsense resources? We study these questions in the context of answering questions in the TellMeWhy dataset using COMET as a source of relevant commonsense relations. We analyze the effects of model size (T5 and GPT3) along with methods of injecting knowledge (COMET) into these models. Results show that the largest models, as expected, yield substantial improvements over base models. Injecting external knowledge helps models of various sizes, but the amount of improvement decreases with larger model size. We also find that the format in which knowledge is provided is critical, and that smaller models benefit more from larger amounts of knowledge. Finally, we develop an ontology of knowledge types and analyze the relative coverage of the models across these categories.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [5.727928161621094, 6.170063018798828], "id": 1834, "title": "Affective Idiosyncratic Responses to Music", "authors": "Sky CH-Wang, Evan Li, Oliver Li, Smaranda Muresan and Zhou Yu", "abstract": "Affective responses to music are highly personal. Despite consensus that idiosyncratic factors play a key role in regulating how listeners emotionally respond to music, precisely measuring the marginal effects of these variables has proved challenging. To address this gap, we develop computational methods to measure affective responses to music from over 403M listener comments on a Chinese social music platform. Building on studies from music psychology in systematic and quasi-causal analyses, we test for musical, lyrical, contextual, demographic, and mental health effects that drive listener affective responses. Finally, motivated by the social phenomenon known as \u7f51\u6291\u4e91 (w\u01ceng-y\u00ec-y\u00fan), we identify influencing factors of platform user self-disclosures, the social support they receive, and notable differences in discloser user activity.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [3.1884076595306396, 4.628033638000488], "id": 1846, "title": "Successive Prompting for Decomposing Complex Questions", "authors": "Dheeru Dua, Shivanshu Gupta, Sameer Singh and Matt Gardner", "abstract": "Answering complex questions that require making latent decisions is a challenging task, especially when limited supervision is available. Recent works leverage the capabilities of large language models (LMs) to perform complex question answering in a few-shot setting by demonstrating how to output intermediate rationalizations while solving the complex question in a single pass. We introduce ``Successive Prompting'' where, we iteratively break down a complex task into a simple task, solve it, and then repeat the process until we get the final solution. Successive prompting decouples the supervision for decomposing complex questions from the supervision for answering simple questions, allowing us to (1) have multiple opportunities to query in-context examples at each reasoning step (2) learn question decomposition separately from question answering, including using synthetic data, and (3) use bespoke (fine-tuned) components for reasoning steps where a large LM does not perform well. The intermediate supervision is typically manually written, which can be expensive to collect. We introduce a way to generate synthetic dataset which can be used to bootstrap model's ability to decompose and answer intermediate questions. Our best model (with successive prompting) achieves an improvement in F1 of ~5% when compared with a state-of-the-art model with synthetic augmentations and few-shot version of the DROP dataset.", "track": "Question Answering", "label": 11}, {"loc": [4.4579620361328125, 4.39894437789917], "id": 1864, "title": "Maieutic Prompting: Logically Consistent Reasoning with Recursive Explanations", "authors": "Jaehun Jung, Lianhui Qin, Sean Welleck, Faeze Brahman, Chandra Bhagavatula, Ronan Le Bras and Yejin Choi", "abstract": "Pre-trained language models (LMs) struggle with consistent reasoning; recently, prompting LMs to generate explanations that self-guide the inference has emerged as a promising direction to amend this. However, these approaches are fundamentally bounded by the correctness of explanations, which themselves are often noisy and inconsistent. In this work, we develop Maieutic Prompting, which aims to infer a correct answer to a question even from the unreliable generations of LM. Maieutic Prompting induces a tree of explanations abductively (e.g. X is true, because ...) and recursively, then frames the inference as a satisfiability problem over these explanations and their logical relations. We test Maieutic Prompting for true/false QA on three challenging benchmarks that require complex commonsense reasoning. Maieutic Prompting achieves up to 20% better accuracy than state-of-the-art prompting methods, and as a fully unsupervised approach, performs competitively with supervised models. We also show that Maieutic Prompting improves robustness in inference while providing interpretable rationales.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [5.693565368652344, 10.654388427734375], "id": 1932, "title": "DANLI: Deliberative Agent for Following Natural Language Instructions", "authors": "Yichi Zhang, Jianing Yang, Jiayi Pan, Shane Storks, Nikhil Devraj, Ziqiao Ma, Keunwoo Peter Yu, Yuwei Bao and Joyce Chai", "abstract": "Recent years have seen an increasing amount of work on embodied AI agents that can perform tasks by following human language instructions. However, most of these agents are reactive, meaning that they simply learn and imitate behaviors encountered in the training data. These reactive agents are insufficient for long-horizon complex tasks. To address this limitation, we propose a neuro-symbolic deliberative agent that, while following language instructions, proactively applies reasoning and planning based on its neural and symbolic representations acquired from past experience (e.g., natural language and egocentric vision). We show that our deliberative agent achieves greater than 70% improvement over reactive baselines on the challenging TEACh benchmark. Moreover, the underlying reasoning and planning processes, together with our modular framework, offer impressive transparency and explainability to the behaviors of the agent. This enables an in-depth understanding of the agent's capabilities, which shed light on challenges and opportunities for future embodied agents for instruction following. The code is available at https://github.com/sled-group/DANLI.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [6.734060287475586, 5.908910751342773], "id": 1943, "title": "Tracing Semantic Variation in Slang", "authors": "Zhewei Sun and Yang Xu", "abstract": "The meaning of a slang term can vary in different communities. However, slang semantic variation is not well understood and under-explored in the natural language processing of slang. One existing view argues that slang semantic variation is driven by culture-dependent communicative needs. An alternative view focuses on slang's social functions suggesting that the desire to foster semantic distinction may have led to the historical emergence of community-specific slang senses. We explore these theories using computational models and test them against historical slang dictionary entries, with a focus on characterizing regularity in the geographical variation of slang usages attested in the US and the UK over the past two centuries. We show that our models are able to predict the regional identity of emerging slang word meanings from historical slang records. We offer empirical evidence that both communicative need and semantic distinction play a role in the variation of slang meaning yet their relative importance fluctuates over the course of history. Our work offers an opportunity for incorporating historical cultural elements into the natural language processing of slang.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [2.0118186473846436, 8.20055103302002], "id": 1945, "title": "Fine-grained Category Discovery under Coarse-grained supervision with Hierarchical Weighted Self-contrastive Learning", "authors": "Wenbin An, Feng Tian, Ping Chen, Siliang Tang, Qinghua Zheng and QianYing Wang", "abstract": "Novel category discovery aims at adapting models trained on known categories to novel categories. Previous works only focus on the scenario where known and novel categories are of the same granularity.\nIn this paper, we investigate a new practical scenario called Fine-grained Category Discovery under Coarse-grained supervision (FCDC). FCDC aims at discovering fine-grained categories with only coarse-grained labeled data, which can adapt models to categories of different granularity from known ones and reduce significant labeling cost. \nIt is also a challenging task since supervised training on coarse-grained categories tends to focus on inter-class distance (distance between coarse-grained classes) but ignore intra-class distance (distance between fine-grained sub-classes) which is essential for separating fine-grained categories.\nConsidering most current methods cannot transfer knowledge from coarse-grained level to fine-grained level, we propose a hierarchical weighted self-contrastive network by building a novel weighted self-contrastive module and combining it with supervised learning in a hierarchical manner.\nExtensive experiments on public datasets show both effectiveness and efficiency of our model over compared methods.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [5.595623970031738, 9.58731460571289], "id": 1964, "title": "PLM-based World Models for Text-based Games", "authors": "Minsoo Kim, Yeonjoon Jung, Dohyeon Lee and Seung-won Hwang", "abstract": "World models have improved the ability of reinforcement learning agents to operate in a sample efficient manner, by being trained to predict plausible changes in the underlying environment. As the core tasks of world models are future prediction and commonsense understanding, our claim is that pre-trained language models (PLMs) already provide a strong base upon which to build world models. Worldformer is a recently proposed world model for text-based game environments, based only partially on PLM and transformers. Our distinction is to fully leverage PLMs as actionable world models in text-based game environments, by reformulating generation as constrained decoding which decomposes actions into verb templates and objects. We show that our model improves future valid action prediction and graph change prediction. Additionally, we show that our model better reflects commonsense than standard PLM.", "track": "NLP Applications", "label": 0}, {"loc": [7.959944725036621, 9.772128105163574], "id": 2041, "title": "Prompt-Based Meta-Learning For Few-shot Text Classification", "authors": "Haoxing Zhang, Xiaofeng Zhang, Haibo Huang and Lei Yu", "abstract": "Few-shot Text Classification predicts the semantic label of a given text with a handful of supporting instances. Current meta-learning methods have achieved satisfying results in various few-shot situations. Still, they often require a large amount of data to construct many few-shot tasks for meta-training, which is not practical in real-world few-shot scenarios. Prompt-tuning has recently proved to be another effective few-shot learner by bridging the gap between pre-train and downstream tasks. In this work, we closely combine the two promising few-shot learning methodologies in structure and propose a Prompt-Based Meta-Learning (PBML) model to overcome the above meta-learning problem by adding the prompting mechanism. PBML assigns label word learning to base-learners and template learning to meta-learner, respectively. Experimental results show state-of-the-art performance on four text classification datasets under few-shot settings, with higher accuracy and good robustness. We demonstrate through low-resource experiments that our method alleviates the shortcoming that meta-learning requires too much data for meta-training. In the end, we use the visualization to interpret and verify that the meta-learning framework can help the prompting method converge better. We release our code to reproduce our experiments.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [6.431832790374756, 7.517152786254883], "id": 2132, "title": "Geographic Citation Gaps in NLP Research", "authors": "Mukund Rungta, Janvijay Singh, Saif M. Mohammad and Diyi Yang", "abstract": "In a fair world, people have equitable opportunities to education, to conduct scientific research, to publish, and to get credit for their work, regardless of where they live. However, it is common knowledge among researchers that a vast number of papers accepted at top NLP venues come from a handful of western countries and (lately) China; whereas, very few papers from Africa and South America get published. Similar disparities are also believed to exist for paper citation counts. In the spirit of \"what we do not measure, we cannot improve\u201d, this work asks a series of questions on the relationship between geographical location and publication success (acceptance in top NLP venues and citation impact). We first created a dataset of 70,000 papers from the ACL Anthology, extracted their meta-information, and\ngenerated their citation network. We then show that not only are there substantial geographical disparities in paper acceptance and citation but also that these disparities persist even when controlling for a number of variables such as venue of publication and sub-field of NLP. Further, despite some steps taken by the NLP community to improve geographical diversity, we show that the disparity in publication metrics across locations is still on an increasing trend since the early 2000s. We release our code and dataset here: https://github.com/iamjanvijay/acl-cite-net", "track": "Theme Track", "label": 18}, {"loc": [4.683065414428711, 3.4136130809783936], "id": 2133, "title": "Language Models of Code are Few-Shot Commonsense Learners", "authors": "Aman Madaan, Shuyan Zhou, Uri Alon, Yiming Yang and Graham Neubig", "abstract": "We address the general task of structured commonsense reasoning: given a natural language input, the goal is to generate a graph such as an event or a reasoning-graph.\nTo employ large language models (LMs) for this task, existing approaches 'serialize' the output graph as a flat list of nodes and edges.\nAlthough feasible, these serialized graphs strongly deviate from the natural language corpora that LMs were pre-trained on, hindering LMs from generating them correctly. \nIn this paper, we show that when we instead frame structured commonsense reasoning tasks as code generation tasks, pre-trained LMs of code are better structured commonsense reasoners than LMs of natural language, even when the downstream task does not involve source code at all.\nWe demonstrate our approach across three diverse structured commonsense reasoning tasks. In all these natural language tasks, we show that using our approach, a code generation LM~(codex) outperforms natural-LMs that are fine-tuned on the target task (T5) and other strong LMs such as GPT-3 in the few-shot setting.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [9.428143501281738, 8.06722640991211], "id": 2172, "title": "Numerical Optimizations for Weighted Low-rank Estimation on Language Models", "authors": "Ting Hua, Yen-Chang Hsu, Felicity Wang, Qian Lou, Yilin Shen and Hongxia Jin", "abstract": "Singular value decomposition (SVD) is one of the most popular compression methods that approximate a target matrix with smaller matrices. However, standard SVD treats the parameters within the matrix with equal importance, which is a simple but unrealistic assumption. The parameters of a trained neural network model may affect the task performance unevenly, which suggests non-equal importance among the parameters. Compared to SVD, the decomposition method aware of parameter importance is the more practical choice in real cases. Unlike standard SVD, weighed value decomposition is a non-convex optimization problem that lacks a closed-form solution. \nWe systematically investigated multiple optimization strategies to tackle the problem and examined our method by compressing Transformer-based language models.\nFurther, we designed a metric to predict when the SVD may introduce a significant performance drop, for which our method can be a rescue strategy.\nThe extensive evaluations demonstrate that our method can perform better than current SOTA methods in compressing Transformer-based language models.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [1.8095660209655762, 3.8797428607940674], "id": 2196, "title": "Generative Multi-hop Retrieval", "authors": "Hyunji Lee, Sohee Yang, Hanseok Oh and Minjoon Seo", "abstract": "A common practice for text retrieval is to use an encoder to map the documents and the query to a common vector space and perform a nearest neighbor search (NNS); multi-hop retrieval also often adopts the same paradigm, usually with a modification of iteratively reformulating the query vector so that it can retrieve different documents at each hop. However, such a bi-encoder approach has limitations in multi-hop settings; (1) the reformulated query gets longer as the number of hops increases, which further tightens the embedding bottleneck of the query vector, and (2) it is prone to error propagation. In this paper, we focus on alleviating these limitations in multi-hop settings by formulating the problem in a fully generative way. We propose an encoder-decoder model that performs multi-hop retrieval by simply generating the entire text sequences of the retrieval targets, which means the query and the documents interact in the language model's parametric space rather than L2 or inner product space as in the bi-encoder approach. Our approach, Generative Multi-hop Retrieval (GMR), consistently achieves comparable or higher performance than bi-encoder models in five datasets while demonstrating superior GPU memory and storage footprint.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [4.989002704620361, 12.464166641235352], "id": 2204, "title": "Visual Spatial Description: Controlled Spatial-Oriented Image-to-Text Generation", "authors": "Yu Zhao, Jianguo Wei, ZhiChao Lin, Yueheng Sun, Meishan Zhang and Min Zhang", "abstract": "Image-to-text tasks such as open-ended image captioning and controllable image description have received extensive attention for decades. Here we advance this line of work further, presenting Visual Spatial Description (VSD), a new perspective for image-to-text toward spatial semantics. Given an image and two objects inside it, VSD aims to produce one description focusing on the spatial perspective between the two objects. Accordingly, we annotate a dataset manually to facilitate the investigation of the newly-introduced task, and then build several benchmark encoder-decoder models by using VL-BART and VL-T5 as backbones. In addition, we investigate visual spatial relationship classification (VSRC) information into our model by pipeline and end-to-end architectures. Finally, we conduct experiments on our benchmark dataset to evaluate all our models. Results show that our models are awe-inspiring, offering accurate and human-like spatial-oriented text descriptions. Besides, VSRC has great potential for VSD, and the joint end-to-end architecture is the better choice for their integration. We will make the dataset and codes publicly available for research purposes.", "track": "Natural Language Generation", "label": 6}, {"loc": [2.911705255508423, 4.623122692108154], "id": 2221, "title": "M3: A Multi-View Fusion and Multi-Decoding Network for Multi-Document Reading Comprehension", "authors": "Liang Wen, Houfeng Wang, Yingwei Luo and Xiaolin Wang", "abstract": "Multi-document reading comprehension task requires collecting evidences from different documents for answering questions. Previous research works either use the extractive modeling method to naively integrate the scores from different documents on the encoder side or use the generative modeling method to collect the clues from different documents on the decoder side individually. However, any single modeling method cannot make full of the advantages of both. In this work, we propose a novel method that tries to employ a multi-view fusion and multi-decoding mechanism to achieve it. For one thing, our approach leverages question-centered fusion mechanism and cross-attention mechanism to gather fine-grained fusion of evidence clues from different documents in the encoder and decoder concurrently. For another, our method simultaneously employs both the extractive decoding approach and the generative decoding method to effectively guide the training process. Compared with existing methods, our method can perform both extractive decoding and generative decoding independently and optionally. Our experiments on two mainstream multi-document reading comprehension datasets (Natural Questions and TriviaQA) demonstrate that our method can provide consistent improvements over previous state-of-the-art methods.", "track": "Question Answering", "label": 11}, {"loc": [1.807599425315857, 3.897414207458496], "id": 2247, "title": "COCO-DR: Combating the Distribution Shift in Zero-Shot Dense Retrieval with Contrastive and Distributionally Robust Learning", "authors": "Yue Yu, Chenyan Xiong, Si Sun, Chao Zhang and Arnold Overwijk", "abstract": "We present a new zero-shot dense retrieval (ZeroDR) method, COCO-DR, to improve the generalization ability of dense retrieval by combating the distribution shifts between source training tasks and target scenarios. To mitigate the impact of document differences, COCO-DR continues pretraining the language model on the target corpora to adapt the model to target distributions via COtinuous COtrastive learning. To prepare for unseen target queries, COCO-DR leverages implicit Distributionally Robust Optimization (iDRO) to reweight samples from different source query clusters for improving model robustness over rare queries during fine-tuning. \nCOCO-DR achieves superior average performance on BEIR, the zero-shot retrieval benchmark. \nAt BERT_Base scale, COCO-DR Base outperforms other ZeroDR models with 60x larger size. At BERT_Large scale, COCO-DR Large outperforms the giant GPT-3 embedding model which has 500x more parameters. Our analysis shows the correlation between COCO-DR's effectiveness in combating distribution shifts and improving zero-shot accuracy. Our code and model can be found at \\url{https://github.com/OpenMatch/COCO-DR}.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [7.300162315368652, 7.916706085205078], "id": 2256, "title": "Language Model Pre-Training with Sparse Latent Typing", "authors": "Liliang Ren, Zixuan Zhang, Han Wang, Clare Voss, ChengXiang Zhai and Heng Ji", "abstract": "Modern large-scale Pre-trained Language Models (PLMs) have achieved tremendous success on a wide range of downstream tasks. However, most of the LM pre-training objectives only focus on text reconstruction, but have not sought to learn latent-level interpretable representations of sentences. In this paper, we manage to push the language models to obtain a deeper understanding of sentences by proposing a new pre-training objective, Sparse Latent Typing, which enables the model to sparsely extract sentence-level keywords with diverse latent types. Experimental results show that our model is able to learn interpretable latent type categories in a self-supervised manner without using any external knowledge. Besides, the language model pre-trained with such an objective also significantly improves Information Extraction related downstream tasks in both supervised and few-shot settings. Our code is publicly available at https://github.com/renll/SparseLT .", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [7.718135833740234, 8.1510009765625], "id": 2260, "title": "On the Transformation of Latent Space in Fine-Tuned NLP Models", "authors": "Nadir Durrani, Hassan Sajjad, Fahim Dalvi and Firoj Alam", "abstract": "We study the evolution of latent space in fine-tuned NLP models. Different from the commonly used probing-framework, we opt for an unsupervised method to analyze representations. More specifically, we discover latent concepts in the representational space using hierarchical clustering. We \nthen use an alignment function to gauge the similarity between the latent space of a pre-trained model and its fine-tuned version. We use traditional linguistic concepts to facilitate our understanding and also study how the model space transforms towards task-specific information. We perform a thorough analysis, comparing pre-trained and fine-tuned models across three models and three downstream tasks. The notable findings of our work are: i) the latent space of the higher layers evolve towards task-specific concepts, ii) whereas the lower layers retain generic concepts acquired in the pre-trained model, iii) we discovered that some concepts in the higher layers acquire polarity towards the output class, and iv) that these concepts can be used for generating adversarial triggers.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.368982315063477, 7.537463665008545], "id": 2298, "title": "Watch the Neighbors: A Unified K-Nearest Neighbor Contrastive Learning Framework for OOD Intent Discovery", "authors": "Yutao Mou, Keqing He, Pei Wang, Yanan Wu, Jingang Wang, Wei Wu and Weiran Xu", "abstract": "Discovering out-of-domain (OOD) intent is important for developing new skills in task-oriented dialogue systems. The key challenges lie in how to transfer prior in-domain (IND) knowledge to OOD clustering, as well as jointly learn OOD representations and cluster assignments. Previous methods suffer from in-domain overfitting problem, and there is a natural gap between representation learning and clustering objectives. In this paper, we propose a unified K-nearest neighbor contrastive learning framework to discover OOD intents. Specifically, for IND pre-training stage, we propose a KCL objective to learn inter-class discriminative features, while maintaining intra-class diversity, which alleviates the in-domain overfitting problem. For OOD clustering stage, we propose a KCC method to form compact clusters by mining true hard negative samples, which bridges the gap between clustering and representation learning. Extensive experiments on three benchmark datasets show that our method achieves substantial improvements over the state-of-the-art methods.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [8.134905815124512, 3.016488790512085], "id": 2299, "title": "Extracted BERT Model Leaks More Information than You Think!", "authors": "Xuanli He, Lingjuan Lyu, Chen Chen and Qiongkai Xu", "abstract": "The collection and availability of big data, combined with advances in pre-trained models (e.g. BERT), have revolutionized the predictive performance of natural language processing tasks. This allows corporations to provide machine learning as a service (MLaaS) by encapsulating fine-tuned BERT-based models as APIs. Due to significant commercial interest, there has been a surge of attempts to steal remote services via model extraction. Although previous works have made progress in defending against model extraction attacks, there has been little discussion on their performance in preventing privacy leakage. This work bridges this gap by launching an attribute inference attack against the extracted BERT model. Our extensive experiments reveal that model extraction can cause severe privacy leakage even when victim models are facilitated with state-of-the-art defensive strategies.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [5.44429349899292, 12.283903121948242], "id": 2327, "title": "Do Vision-and-Language Transformers Learn Grounded Predicate-Noun Dependencies?", "authors": "Mitja Nikolaus, Emmanuelle Salin, Stephane Ayache, Abdellah Fourtassi and Benoit Favre", "abstract": "Recent advances in vision-and-language modeling have seen the development of Transformer architectures that achieve remarkable performance on multimodal reasoning tasks.\nYet, the exact capabilities of these black-box models are still poorly understood. While much of previous work has focused on studying their ability to learn meaning at the word-level, their ability to track syntactic dependencies between words has received less attention.\nWe take a first step in closing this gap by creating a new multimodal task targeted at evaluating understanding of predicate-noun dependencies in a controlled setup.\nWe evaluate a range of state-of-the-art models and find that their performance on the task varies considerably, with some models performing relatively well and others at chance level. In an effort to explain this variability, our analyses indicate that the quality (and not only sheer quantity) of pretraining data is essential. Additionally, the best performing models leverage fine-grained multimodal pretraining objectives in addition to the standard image-text matching objectives.\nThis study highlights that targeted and controlled evaluations are a crucial step for a precise and rigorous test of the multimodal knowledge of vision-and-language models.", "track": "Ethic Concerns:Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.466250419616699, 4.917712211608887], "id": 2361, "title": "A Multilingual Perspective Towards the Evaluation of Attribution Methods in Natural Language Inference", "authors": "Kerem Zaman and Yonatan Belinkov", "abstract": "Most evaluations of attribution methods focus on the English language. In this work, we present a multilingual approach for evaluating attribution methods for the Natural Language Inference (NLI) task in terms of faithfulness and plausibility.\nFirst, we introduce a novel cross-lingual strategy to measure faithfulness based on word alignments, which eliminates the drawbacks of erasure-based evaluations.\nWe then perform a comprehensive evaluation of attribution methods, considering different output mechanisms and aggregation methods.\nFinally, we augment the XNLI dataset with highlight-based explanations, providing a multilingual NLI dataset with highlights, to support future exNLP studies. Our results show that attribution methods performing best for plausibility and faithfulness are different.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [8.924700736999512, 6.284296035766602], "id": 2400, "title": "Graph-Based Multilingual Label Propagation for Low-Resource Part-of-Speech Tagging", "authors": "Ayyoob ImaniGooghari, Silvia Severini, Masoud Jalili Sabet, Fran\u00e7ois Yvon and Hinrich Sch\u00fctze", "abstract": "Part-of-Speech (POS) tagging is an important component of the NLP pipeline, \nbut many low-resource languages lack labeled data for training. An established method for training a POS tagger in such a scenario is to create a labeled training set by transferring from high-resource languages. In this paper, we propose a novel method for transferring labels from multiple high-resource source to low-resource target languages. We formalize POS tag projection as graph-based label propagation. Given translations of a sentence in multiple languages, we create a graph with words as nodes and alignment links as edges by aligning words for all language pairs. We then propagate node labels from source to target using a Graph Neural Network augmented with transformer layers. \nWe show that our propagation creates training sets that allow us to train POS taggers for a diverse set of languages. When combined with enhanced contextualized embeddings, our method achieves a new \nstate-of-the-art for unsupervised POS tagging of low-resource languages.", "track": "Multilinguality", "label": 13}, {"loc": [4.753547191619873, 9.127262115478516], "id": 2430, "title": "SubeventWriter: Iterative Sub-event Sequence Generation with Coherence Controller", "authors": "Zhaowei Wang, Hongming Zhang, Tianqing Fang, Yangqiu Song, Ginny Y. Wong and Simon See", "abstract": "In this paper, we propose a new task of sub-event generation for an unseen process to evaluate the understanding of the coherence of sub-event actions and objects. To solve the problem, we design SubeventWriter, a sub-event sequence generation framework with a coherence controller. Given an unseen process, the framework can iteratively construct the sub-event sequence by generating one sub-event at each iteration. We also design a very effective coherence controller to decode more coherent sub-events. As our extensive experiments and analysis indicate, SubeventWriter can generate more reliable and meaningful sub-event sequences for unseen processes.", "track": "Natural Language Generation", "label": 6}, {"loc": [6.8713059425354, 9.872618675231934], "id": 2432, "title": "Infinite SCAN: An Infinite Model of Diachronic Semantic Change", "authors": "Seiichi Inoue, Mamoru Komachi, Toshinobu Ogiso, Hiroya Takamura and Daichi Mochihashi", "abstract": "In this study, we propose a Bayesian model that can jointly estimate the number of senses of words and their changes through time.\nThe model combines a dynamic topic model on Gaussian Markov random fields with a logistic stick-breaking process that realizes Dirichlet process. \nIn the experiments, we evaluated the proposed model in terms of interpretability, accuracy in estimating the number of senses, and tracking their changes using both artificial data and real data.\nWe quantitatively verified that the model behaves as expected through evaluation using artificial data.\nUsing the CCOHA corpus, we showed that our model outperforms the baseline model and investigated the semantic changes of several well-known target words.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [7.819373607635498, 8.551979064941406], "id": 2490, "title": "Learning Instructions with Unlabeled Data for Zero-Shot Cross-Task Generalization", "authors": "Yuxian Gu, Pei Ke, Xiaoyan Zhu and Minlie Huang", "abstract": "Training language models to learn from human instructions for zero-shot cross-task generalization has attracted much attention in NLP communities. Recently, instruction tuning (IT), which fine-tunes a pre-trained language model on a massive collection of tasks described via human-craft instructions, has been shown effective in instruction learning for unseen tasks. However, IT relies on a large amount of human-annotated samples, which restricts its generalization. Unlike labeled data, unlabeled data are often massive and cheap to obtain. In this work, we study how IT can be improved with unlabeled data. We first empirically explore the IT performance trends versus the number of labeled data, instructions, and training tasks. We find it critical to enlarge the number of training instructions, and the instructions can be underutilized due to the scarcity of labeled data. Then, we propose Unlabeled Data Augmented Instruction Tuning (UDIT) to take better advantage of the instructions during IT by constructing pseudo-labeled data from unlabeled plain texts. We conduct extensive experiments to show UDIT's effectiveness in various scenarios of tasks and datasets. We also comprehensively analyze the key factors of UDIT to investigate how to better improve IT with unlabeled data. The code is publicly available at https://github.com/thu-coai/UDIT.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [4.077859878540039, 7.372326850891113], "id": 2525, "title": "Counterfactual Data Augmentation via Perspective Transition for Open-Domain Dialogues", "authors": "Jiao Ou, Jinchao Zhang, Yang Feng and Jie Zhou", "abstract": "The construction of open-domain dialogue systems requires high-quality dialogue datasets. The dialogue data admits a wide variety of responses for a given dialogue history, especially responses with different semantics. However, collecting high-quality such a dataset in most scenarios is labor-intensive and time-consuming. In this paper, we propose a data augmentation method to automatically augment high-quality responses with different semantics by counterfactual inference. Specifically, given an observed dialogue, our counterfactual generation model first infers semantically different responses by replacing the observed reply perspective with substituted ones. Furthermore, our data selection method filters out detrimental augmented responses. Experimental results show that our data augmentation method can augment high-quality responses with different semantics for a given dialogue history, and can outperform competitive baselines on multiple downstream tasks.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [0.702044665813446, 6.691017150878906], "id": 2527, "title": "SQUIRE: A Sequence-to-sequence Framework for Multi-hop Knowledge Graph Reasoning", "authors": "Yushi Bai, Xin Lv, Juanzi Li, Lei Hou, Yincen Qu, zelin Dai and Feiyu Xiong", "abstract": "Multi-hop knowledge graph (KG) reasoning has been widely studied in recent years to provide interpretable predictions on missing links with evidential paths. Most previous works use reinforcement learning (RL) based methods that learn to navigate the path towards the target entity. However, these methods suffer from slow and poor convergence, and they may fail to infer a certain path when there is a missing edge along the path. Here we present SQUIRE, the first Sequence-to-sequence based multi-hop reasoning framework, which utilizes an encoder-decoder Transformer structure to translate the query to a path. Our framework brings about two benefits: (1) It can learn and predict in an end-to-end fashion, which gives better and faster convergence; (2) Our transformer model does not rely on existing edges to generate the path, and has the flexibility to complete missing edges along the path, especially in sparse KGs. Experiments on standard and sparse KGs show that our approach yields significant improvement over prior methods, while converging 4x-7x faster.", "track": "Information Extraction", "label": 5}, {"loc": [10.152667999267578, 7.764471054077148], "id": 2551, "title": "SpeechUT: Bridging Speech and Text with Hidden-Unit for Encoder-Decoder Based Speech-Text Pre-training", "authors": "Ziqiang Zhang, Long Zhou, Junyi Ao, Shujie Liu, Lirong Dai, Jinyu Li and Furu Wei", "abstract": "The rapid development of single-modal pre-training has prompted researchers to pay more attention to cross-modal pre-training methods. In this paper, we propose a unified-modal speech-unit-text pre-training model, SpeechUT, to connect the representations of a speech encoder and a text decoder with a shared unit encoder. Leveraging hidden-unit as an interface to align speech and text, we can decompose the speech-to-text model into a speech-to-unit model and a unit-to-text model, which can be jointly pre-trained with unpaired speech and text data respectively. Our proposed SpeechUT is fine-tuned and evaluated on automatic speech recognition (ASR) and speech translation (ST) tasks. Experimental results show that SpeechUT gets substantial improvements over strong baselines, and achieves state-of-the-art performance on both the LibriSpeech ASR and MuST-C ST tasks. To better understand the proposed SpeechUT, detailed analyses are conducted. The code and pre-trained models are available at https://aka.ms/SpeechUT.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [7.337219715118408, 9.626974105834961], "id": 2647, "title": "Learning Label Modular Prompts for Text Classification in the Wild", "authors": "Hailin Chen, Amrita Saha, Shafiq Joty and Steven C.H. Hoi", "abstract": "Machine learning models usually assume i.i.d data during training and testing, but data and tasks in real world often change over time. To emulate the transient nature of real world, we propose a challenging but practical task: text classification in-the-wild, which introduces different non-stationary training/testing stages. Decomposing a complex task into modular components can enable robust generalisation under such non-stationary environment. However, current modular approaches in NLP do not take advantage of recent advances in parameter efficient tuning of pretrained language models. To close this gap, we propose ModularPrompt, a label-modular prompt tuning framework for text classification tasks. In ModularPrompt, the input prompt consists of a sequence of soft label prompts, each encoding modular knowledge related to the corresponding class label. In two of most formidable settings, ModularPrompt outperforms relevant baselines by a large margin demonstrating strong generalisation ability. We also conduct comprehensive analysis to validate whether the learned prompts satisfy properties of a modular representation.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [8.282912254333496, 6.642906665802002], "id": 2669, "title": "Unbiased and Efficient Sampling of Dependency Trees", "authors": "Milo\u0161 Stanojevi\u0107", "abstract": "Most computational models of dependency syntax consist of distributions over spanning trees. However, the majority of dependency treebanks require that every valid dependency tree has a single edge coming out of the ROOT node, a constraint that is not part of the definition of spanning trees. For this reason all standard inference algorithms for spanning trees are sub-optimal for inference over dependency trees.\n\nZmigrod et al (2021) proposed algorithms for sampling with and without replacement from the dependency tree distribution that incorporate the single-root constraint. In this paper we show that their fastest algorithm for sampling with replacement, Wilson-RC, is in fact producing biased samples and we provide two alternatives that are unbiased. Additionally, we propose two algorithms (one incremental, one parallel) that reduce the asymptotic runtime of algorithm for sampling k trees without replacement to O(kn^3). These algorithms are both asymptotically and practically more efficient.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [10.159270286560059, 7.069762229919434], "id": 2692, "title": "Continual Learning of Neural Machine Translation within Low Forgetting Risk Regions", "authors": "Shuhao Gu, Bojie Hu and Yang Feng", "abstract": "This paper considers continual learning of large-scale pretrained neural machine translation model without accessing the previous training data or introducing model separation. We argue that the widely used regularization-based methods, which perform multi-objective learning with an auxiliary loss, suffer from the misestimate problem and cannot always achieve a good balance between the previous and new tasks. To solve the problem, we propose a two-stage training method based on the local features of the real loss. We first search low forgetting risk regions, where the model can retain the performance on the previous task as the parameters are updated, to avoid the catastrophic forgetting problem. Then we can continually train the model within this region only with the new training data to fit the new task. Specifically, we propose two methods to search the low forgetting risk regions, which are based on the curvature of loss and the impacts of the parameters on the model output, respectively. We conduct experiments on domain adaptation and more challenging language adaptation tasks, and the experimental results show that our method can achieve significant improvements compared with several strong baselines.", "track": "Machine Translation", "label": 10}, {"loc": [8.888801574707031, 8.184469223022461], "id": 2741, "title": "COST-EFF: Collaborative Optimization of Spatial and Temporal Efficiency with Slenderized Multi-exit Language Models", "authors": "Bowen Shen, Zheng Lin, Yuanxin LIU, Zhengxiao Liu, Lei Wang and Weiping Wang", "abstract": "Transformer-based pre-trained language models (PLMs) mostly suffer from excessive overhead despite their advanced capacity. For resource-constrained devices, there is an urgent need for a spatially and temporally efficient model which retains the major capacity of PLMs. However, existing statically compressed models are unaware of the diverse complexities between input instances, potentially resulting in redundancy and inadequacy for simple and complex inputs. Also, miniature models with early exiting encounter challenges in the trade-off between making predictions and serving the deeper layers. Motivated by such considerations, we propose a collaborative optimization for PLMs that integrates static model compression and dynamic inference acceleration. Specifically, the PLM is slenderized in width while the depth remains intact, complementing layer-wise early exiting to speed up inference dynamically. To address the trade-off of early exiting, we propose a joint training approach that calibrates slenderization and preserves contributive structures to each exit instead of only the final layer. Experiments are conducted on GLUE benchmark and the results verify the Pareto optimality of our approach at high compression and acceleration rate with 1/8 parameters and 1/19 FLOPs of BERT.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [0.7676991820335388, 7.998960018157959], "id": 2794, "title": "Rescue Implicit and Long-tail Cases: Nearest Neighbor Relation Extraction", "authors": "Zhen Wan, Qianying Liu, Zhuoyuan Mao, Fei Cheng, Sadao Kurohashi and Jiwei Li", "abstract": "Relation extraction (RE) has achieved remarkable progress with the help of pre-trained language models. However, existing RE models are usually incapable of handling two situations: implicit expressions and long-tail relation types, caused by language complexity and data sparsity. In this paper, we introduce a simple enhancement of RE using $k$ nearest neighbors ($k$NN-RE). $k$NN-RE allows the model to consult training relations at test time through a nearest-neighbor search and provides a simple yet effective means to tackle the two issues above. Additionally, we observe that $k$NN-RE serves as an effective way to leverage distant supervision (DS) data for RE. Experimental results show that the proposed $k$NN-RE achieves state-of-the-art performances on a variety of supervised RE datasets, i.e., ACE05, SciERC, and Wiki80, along with outperforming the best model to date on the i2b2 and Wiki80 datasets in the setting of allowing using DS. Our code and models are available at: https://github.com/YukinoWan/kNN-RE.", "track": "Information Extraction", "label": 5}, {"loc": [4.661730766296387, 9.162493705749512], "id": 2802, "title": "StoryER: Automatic Story Evaluation via Ranking, Rating and Reasoning", "authors": "Hong Chen, Duc Vo, Hiroya Takamura, Yusuke Miyao and Hideki Nakayama", "abstract": "Existing automatic story evaluation methods place a premium on story lexical level coherence, deviating from human preference.\nWe go beyond this limitation by considering a novel Story Evaluation method that mimics human preference when judging a story, namely StoryER, which consists of three sub-tasks: Ranking, Rating and Reasoning.\nGiven either a machine-generated or a human-written story, StoryER requires the machine to output 1) a preference score that corresponds to human preference, 2) specific ratings and their corresponding confidences and 3) comments for various aspects (e.g., opening, character-shaping).\nTo support these tasks, we introduce a well-annotated dataset comprising (i) 100k ranked story pairs; and (ii) a set of 46k ratings and comments on various aspects of the story.\nWe finetune Longformer-Encoder-Decoder (LED) on the collected dataset, with the encoder responsible for preference score and aspect prediction and the decoder for comment generation.\nOur comprehensive experiments result a competitive benchmark for each task, showing the high correlation to human preference.\nIn addition, we have witnessed the joint learning of the preference scores, the aspect ratings, and the comments brings gain each single task.\nOur dataset and benchmarks are publicly available to advance the research of story evaluation tasks.", "track": "Resources and Evaluation", "label": 1}, {"loc": [3.994086265563965, 4.318916320800781], "id": 2807, "title": "Enhancing Self-Consistency and Performance of Pre-Trained Language Models through Natural Language Inference", "authors": "Eric A. Mitchell, Joseph Noh, Siyan Li, Will Armstrong, Ananth Agarwal, Patrick Liu, Chelsea B. Finn and Christopher D. Manning", "abstract": "While large pre-trained language models are powerful, their predictions often lack logical consistency across test inputs. For example, a state-of-the-art Macaw question-answering (QA) model answers Yes to Is a sparrow a bird? and Does a bird have feet? but answers No to Does a sparrow have feet?. To address this failure mode, we propose a framework, Consistency Correction through Relation Detection, or ConCoRD, for boosting the consistency and accuracy of pre-trained NLP models using pre-trained natural language inference (NLI) models without fine-tuning or re-training. Given a batch of test inputs, ConCoRD samples several candidate outputs for each input and instantiates a factor graph that accounts for both the model's belief about the likelihood of each answer choice in isolation and the NLI model's beliefs about pair-wise answer choice compatibility. We show that a weighted MaxSAT solver can efficiently compute high-quality answer choices under this factor graph, improving over the raw model's predictions. Our experiments demonstrate that ConCoRD consistently boosts accuracy and consistency of off-the-shelf closed-book QA and VQA models using off-the-shelf NLI models, notably increasing accuracy of LXMERT on ConVQA by 5% absolute. See the project website (https://ericmitchell.ai/emnlp-2022-concord/) for code and data.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [7.596951961517334, 9.060623168945312], "id": 2827, "title": "Robustness of Demonstration-based Learning Under Limited Data Scenario", "authors": "Hongxin Zhang, Yanzhe Zhang, Ruiyi Zhang and Diyi Yang", "abstract": "Demonstration-based learning has shown great potential in stimulating pretrained language models' ability under limited data scenario. Simply augmenting the input with some demonstrations can significantly improve performance on few-shot NER. However, why such demonstrations are beneficial for the learning process remains unclear since there is no explicit alignment between the demonstrations and the predictions. In this paper, we design pathological demonstrations by gradually removing intuitively useful information from the standard ones to take a deep dive of the robustness of demonstration-based sequence labeling and show that (1) demonstrations composed of random tokens still make the model a better few-shot learner; (2) the length of random demonstrations and the relevance of random tokens are the main factors affecting the performance; (3) demonstrations increase the confidence of model predictions on captured superficial patterns. We have publicly released our code at https://github.com/SALT-NLP/RobustDemo.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [6.362502098083496, 7.5349955558776855], "id": 2843, "title": "Modeling Information Change in Science Communication with Semantically Matched Paraphrases", "authors": "Dustin Wright, Jiaxin Pei, David Jurgens and Isabelle Augenstein", "abstract": "Whether the media faithfully communicate scientific information has long been a core issue to the science community. Automatically identifying paraphrased scientific findings could enable large-scale tracking and analysis of information changes in the science communication process, but this requires systems to understand the similarity between scientific information across multiple domains. To this end, we present the SCIENTIFIC PARAPHRASE AND INFORMATION CHANGE DATASET (SPICED), the first paraphrase dataset of scientific findings annotated for degree of information change. SPICED contains 6,000 scientific finding pairs extracted from news stories, social media discussions, and full texts of original papers. We demonstrate that SPICED poses a challenging task and that models trained on SPICED improve downstream performance on evidence retrieval for fact checking of real-world scientific claims. Finally, we show that models trained on SPICED can reveal large-scale trends in the degrees to which people and organizations faithfully communicate new scientific findings. Data, code, and pre-trained models are available at http://www.copenlu.com/publication/2022_emnlp_wright/.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [7.068703651428223, 6.371949195861816], "id": 2866, "title": "Word Order Matters When You Increase Masking", "authors": "Karim Lasri, Alessandro Lenci and Thierry Poibeau", "abstract": "Word order, an essential property of natural languages, is injected in Transformer-based neural language models using position encoding. However, recent experiments have shown that explicit position encoding is not always useful, since some models without such feature managed to achieve state-of-the art performance on some tasks. To understand better this phenomenon, we examine the effect of removing position encodings on the pre-training objective itself (i.e., masked language modelling), to test whether models can reconstruct position information from co-occurrences alone. We do so by controlling the amount of masked tokens in the input sentence, as a proxy to affect the importance of position information for the task. We find that the necessity of position information increases with the amount of masking, and that masked language models without position encodings are not able to reconstruct this information on the task. These findings point towards a direct relationship between the amount of masking and the ability of Transformers to capture order-sensitive aspects of language using position encoding.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [7.898085594177246, 3.3622450828552246], "id": 2902, "title": "An Empirical Analysis of Memorization in Fine-tuned Autoregressive Language Models", "authors": "Fatemehsadat Mireshghallah, Archit Uniyal, Tianhao Wang, David Evans and Taylor Berg-Kirkpatrick", "abstract": "Large language models are shown to present privacy risks through memorization of training data, and\nseveral recent works have studied such risks for the pre-training phase. Little attention, however, has been given to the fine-tuning phase and it is not well understood how different fine-tuning methods (such as fine-tuning the full model, the model head, and adapter) compare in terms of memorization risk. This presents increasing concern as the ``pre-train and fine-tune'' paradigm proliferates. In this paper, we empirically study memorization of fine-tuning methods using membership inference and extraction attacks, and show that their susceptibility to attacks is very different. We observe that fine-tuning the head of the model has the highest susceptibility to attacks, whereas fine-tuning smaller adapters appears to be less vulnerable to known extraction attacks.", "track": "Ethics", "label": 21}, {"loc": [1.7898797988891602, 9.129376411437988], "id": 2980, "title": "Style Transfer as Data Augmentation: A Case Study on Named Entity Recognition", "authors": "Shuguang Chen, Leonardo Neves and Thamar Solorio", "abstract": "In this work, we take the named entity recognition task in the English language as a case study and explore style transfer as a data augmentation method to increase the size and diversity of training data in low-resource scenarios. We propose a new method to effectively transform the text from a high-resource domain to a low-resource domain by changing its style-related attributes to generate synthetic data for training. Moreover, we design a constrained decoding algorithm along with a set of key ingredients for data selection to guarantee the generation of valid and coherent data. Experiments and analysis on five different domain pairs under different data regimes demonstrate that our approach can significantly improve results compared to current state-of-the-art data augmentation methods. Our approach is a practical solution to data scarcity, and we expect it to be applicable to other NLP tasks.", "track": "Information Extraction", "label": 5}, {"loc": [6.037842273712158, 8.422521591186523], "id": 3005, "title": "Linguistic Corpus Annotation for Automatic Text Simplification Evaluation", "authors": "R\u00e9mi Cardon, Adrien Bibal, Rodrigo Wilkens, David Alfter, Magali Norr\u00e9, Adeline M\u00fcller, Patrick Watrin and Thomas Fran\u00e7ois", "abstract": "Evaluating automatic text simplification (ATS) systems is a difficult task that is either performed by automatic metrics or user-based evaluations. However, from a linguistic point-of-view, it is not always clear on what bases these evaluations operate. In this paper, we propose annotations of the ASSET corpus that can be used to shed more light on ATS evaluation. In addition to contributing with this resource, we show how it can be used to analyze SARI's behavior and to re-evaluate existing ATS systems. We present our insights as a step to improve ATS evaluation protocols in the future.", "track": "Resources and Evaluation", "label": 1}, {"loc": [0.7345057725906372, 6.453988552093506], "id": 3014, "title": "Semantic Framework based Query Generation for Temporal Question Answering over Knowledge Graphs", "authors": "Wentao Ding, Hao Chen, Huayu Li and Yuzhong Qu", "abstract": "Answering factual questions with temporal intent over knowledge graphs (temporal KGQA) attracts rising attention in recent years.\nIn the generation of temporal queries, existing KGQA methods ignore the fact that some intrinsic connections between events can make them temporally related, which may limit their capability.\nWe systematically analyze the possible interpretation of temporal constraints and conclude the interpretation structures as the Semantic Framework of Temporal Constraints, SF-TCons.\nBased on the semantic framework, we propose a temporal question answering method, SF-TQA, which generates query graphs by exploring the relevant facts of mentioned entities, where the exploring process is restricted by SF-TCons. \nOur evaluations show that SF-TQA significantly outperforms existing methods on two benchmarks over different knowledge graphs.", "track": "Question Answering", "label": 11}, {"loc": [4.241262912750244, 7.099575519561768], "id": 3039, "title": "There Is No Standard Answer: Knowledge-Grounded Dialogue Generation with Adversarial Activated Multi-Reference Learning", "authors": "Xueliang Zhao, Tingchen Fu, Chongyang Tao and Rui Yan", "abstract": "Knowledge-grounded dialogue (KGC) shows excellent potential to deliver an engaging and informative response. However, existing approaches emphasize selecting one golden knowledge given a particular dialogue context, overlooking the one-to-many phenomenon in dialogue. As a result, existing paradigm limits the diversity of knowledge selection and generation. To this end, we establish a multi-reference KGC dataset and propose a series of metrics to systematically assess the one-to-many efficacy of existing KGC models. Furthermore, to extend the hypothesis space of knowledge selection to enhance the mapping relationship between multiple knowledge and multiple responses, we devise a span-based variational model and optimize the model in a wake-sleep style with an ameliorated evidence lower bound objective to learn the one-to-many generalization. Both automatic and human evaluations demonstrate the efficacy of our approach.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [5.774575710296631, 5.589908599853516], "id": 3079, "title": "Stop Measuring Calibration When Humans Disagree", "authors": "Joris Baan, Wilker Aziz, Barbara Plank and Raquel Fern\u00e1ndez", "abstract": "Calibration is a popular framework to evaluate whether a classifier knows when it does not know - i.e., its predictive probabilities are a good indication of how likely a prediction is to be correct. Correctness is commonly estimated against the human majority class. Recently, calibration to human majority has been measured on tasks where humans inherently disagree about which class applies. We show that measuring calibration to human majority given inherent disagreements is theoretically problematic, demonstrate this empirically on the ChaosNLI dataset, and derive several instance-level measures of calibration that capture key statistical properties of human judgements - including class frequency, ranking and entropy.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [3.9106900691986084, 4.222187042236328], "id": 3107, "title": "Improving compositional generalization for multi-step quantitative reasoning in question answering", "authors": "Armineh Nourbakhsh, Cathy Jiao, Sameena Shah and Carolyn Ros\u00e9", "abstract": "Quantitative reasoning is an important aspect of question answering, especially when numeric and verbal cues interact to indicate sophisticated, multi-step programs. In this paper, we demonstrate how modeling the compositional nature of quantitative text can enhance the performance and robustness of QA models, allowing them to capture arithmetic logic that is expressed verbally. Borrowing from the literature on semantic parsing, we propose a method that encourages the QA models to adjust their attention patterns and capture input/output alignments that are meaningful to the reasoning task. We show how this strategy improves program accuracy and renders the models more robust against overfitting as the number of reasoning steps grows. Our approach is designed as a standalone module which can be prepended to many existing models and trained in an end-to-end fashion without the need for additional supervisory signal. As part of this exercise, we also create a unified dataset building on four previously released numerical QA datasets over tabular data.", "track": "Question Answering", "label": 11}, {"loc": [7.016415596008301, 6.321554660797119], "id": 3114, "title": "A Comprehensive Comparison of Neural Networks as Cognitive Models of Inflection", "authors": "Adam Wiemerslage, Shiran Dudy and Katharina Kann", "abstract": "Neural networks have long been at the center of a debate around the cognitive mechanism by which humans process inflectional morphology. This debate has gravitated into NLP by way of the question: Are neural networks a feasible account for human behavior in morphological inflection?\nWe address that question by measuring the correlation between human judgments and neural network probabilities for unknown word inflections. We test a larger range of architectures than previously studied on two important tasks for the cognitive processing debate: English past tense, and German number inflection. We find evidence that the Transformer may be a better account of human behavior than LSTMs on these datasets, and that LSTM features known to increase inflection accuracy do not always result in more human-like behavior.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [9.819525718688965, 7.841202259063721], "id": 3124, "title": "Can Visual Context Improve Automatic Speech Recognition for an Embodied Agent?", "authors": "Pradip Pramanick and Chayan Sarkar", "abstract": "The usage of automatic speech recognition (ASR) systems are becoming omnipresent ranging from personal assistant to chatbots, home, and industrial automation systems, etc. Modern robots are also equipped with ASR capabilities for interacting with humans as speech is the most natural interaction modality. However, ASR in robots faces additional challenges as compared to a personal assistant. Being an embodied agent, a robot must recognize the physical entities around it and therefore reliably recognize the speech containing the description of such entities. However, current ASR systems are often unable to do so due to limitations in ASR training, such as generic datasets and open-vocabulary modeling. Also, adverse conditions during inference, such as noise, accented, and far-field speech makes the transcription inaccurate. In this work, we present a method to incorporate a robot's visual information into an ASR system and improve the recognition of a spoken utterance containing a visible entity. Specifically, we propose a new decoder biasing technique to incorporate the visual context while ensuring the ASR output does not degrade for incorrect context. We achieve a 59% relative reduction in WER from an unmodified ASR system.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [9.098329544067383, 6.429450988769531], "id": 3165, "title": "AfroLID: A Neural Language Identification Tool for African Languages", "authors": "Ife Adebara, AbdelRahim Elmadany, Muhammad Abdul-Mageed and Alcides Alcoba Inciarte", "abstract": "Language identification (LID) is a crucial precursor for NLP, especially for mining web data. Problematically, most of the world's 7000+ languages today are not covered by LID technologies. We address this pressing issue for Africa by introducing AfroLID, a neural LID toolkit for 517 African languages and varieties. AfroLID exploits a multi-domain web dataset manually curated from across 14 language families utilizing five orthographic systems. When evaluated on our blind Test set, AfroLID achieves 95.89 F_1-score. We also compare AfroLID to five existing LID tools that each cover a small number of African languages, finding it to outperform them on most languages. We further show the utility of AfroLID in the wild by testing it on the acutely under-served Twitter domain. Finally, we offer a number of controlled case studies and perform a linguistically-motivated error analysis that allow us to both showcase AfroLID's powerful capabilities and limitations\\footnote{AfroLID is publicly available at \\href{https://github.com/UBC-NLP/afrolid}{https://github.com/UBC-NLP/afrolid}.\n}", "track": "Multilinguality", "label": 13}, {"loc": [2.303135633468628, 7.357325077056885], "id": 3270, "title": "EvEntS ReaLM: Event Reasoning of Entity States via Language Models", "authors": "Evangelia Spiliopoulou, Artidoro Pagnoni, Yonatan Bisk and Eduard Hovy", "abstract": "This paper investigates models of event implications. Specifically, how well models predict entity state-changes, by targeting their understanding of physical attributes. Nominally, Large Language models (LLM) have been exposed to procedural knowledge about how objects interact, yet our benchmarking shows they fail to reason about the world. Conversely, we also demonstrate that existing approaches often misrepresent the surprising abilities of LLMs via improper task encodings and that proper model prompting can dramatically improve performance of reported baseline results across multiple tasks. In particular, our results indicate that our prompting technique is especially useful for unseen attributes (out-of-domain) or when only limited data is available.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [2.5373737812042236, 8.691519737243652], "id": 3279, "title": "Large language models are few-shot clinical information extractors", "authors": "Monica Agrawal, Stefan Hegselmann, Hunter Lang, Yoon Kim and David Sontag", "abstract": "A long-running goal of the clinical NLP community is the extraction of important variables trapped in clinical notes. However, roadblocks have included dataset shift from the general domain and a lack of public clinical corpora and annotations. In this work, we show that large language models, such as InstructGPT (Ouyang et al., 2022), perform well at zero- and few-shot information extraction from clinical text despite not being trained specifically for the clinical domain. Whereas text classification and generation performance have already been studied extensively in such models, here we additionally demonstrate how to leverage them to tackle a diverse set of NLP tasks which require more structured outputs, including span identification, token-level sequence classification, and relation extraction. Further, due to the dearth of available data to evaluate these systems, we introduce new datasets for benchmarking few-shot clinical information extraction based on a manual re-annotation of the CASI dataset (Moon et al., 2014) for new tasks. On the clinical extraction tasks we studied, the GPT-3 systems significantly outperform existing zero- and few-shot baselines.", "track": "NLP Applications", "label": 0}, {"loc": [5.423282146453857, 8.458403587341309], "id": 3285, "title": "Towards a Unified Multi-Dimensional Evaluator for Text Generation", "authors": "Ming Zhong, Yang Liu, Da Yin, Yuning Mao, Yizhu Jiao, Pengfei Liu, Chenguang Zhu, Heng Ji and Jiawei Han", "abstract": "Multi-dimensional evaluation is the dominant paradigm for human evaluation in Natural Language Generation (NLG), i.e., evaluating the generated text from multiple explainable dimensions, such as coherence and fluency. However, automatic evaluation in NLG is still dominated by similarity-based metrics, and we lack a reliable framework for a more comprehensive evaluation of advanced models. In this paper, we propose a unified multi-dimensional evaluator UniEval for NLG. We re-frame NLG evaluation as a Boolean Question Answering (QA) task, and by guiding the model with different questions, we can use one evaluator to evaluate from multiple dimensions. Furthermore, thanks to the unified Boolean QA format, we are able to introduce an intermediate learning phase that enables UniEval to incorporate external knowledge from multiple related tasks and gain further improvement. Experiments on three typical NLG tasks show that UniEval correlates substantially better with human judgments than existing metrics. Specifically, compared to the top-performing unified evaluators, UniEval achieves a 23% higher correlation on text summarization, and over 43% on dialogue response generation. Also, UniEval demonstrates a strong zero-shot learning ability for unseen evaluation dimensions and tasks. Source code, data, and all pre-trained evaluators are available at https://github.com/maszhongming/UniEval.", "track": "Natural Language Generation", "label": 6}, {"loc": [4.924619674682617, 3.515523910522461], "id": 3331, "title": "GeoMLAMA: Geo-Diverse Commonsense Probing on Multilingual Pre-Trained Language Models", "authors": "Da Yin, Hritik Bansal, Masoud Monajatipoor, Liunian Harold Li and Kai-Wei Chang", "abstract": "Recent work has shown that Pre-trained Language Models (PLMs) store the relational knowledge learned from data and utilize it for performing downstream tasks. However, commonsense knowledge across different regions may vary. For instance, the color of bridal dress is white in American weddings whereas it is red in Chinese weddings. In this paper, we introduce a benchmark dataset, Geo-diverse Commonsense Multilingual Language Models Analysis (GeoMLAMA), for probing the diversity of the relational knowledge in multilingual PLMs. GeoMLAMA contains 3125 prompts in English, Chinese, Hindi, Persian, and Swahili, with a wide coverage of concepts shared by people from American, Chinese, Indian, Iranian and Kenyan cultures. We benchmark 11 standard multilingual PLMs on GeoMLAMA. Interestingly, we find that 1) larger multilingual PLMs variants do not necessarily store geo-diverse concepts better than its smaller variant; 2) multilingual PLMs are not intrinsically biased towards knowledge from the Western countries (the United States); 3) the native language of a country may not be the best language to probe its knowledge and 4) a language may better probe knowledge about a non-native country than its native country.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [9.025883674621582, 6.288629531860352], "id": 3370, "title": "The (Undesired) Attenuation of Human Biases by Multilinguality", "authors": "Cristina Espa\u00f1a-Bonet and Alberto Barr\u00f3n-Cede\u00f1o", "abstract": "Some human preferences are universal. The odor of vanilla is perceived as pleasant all around the world. We expect neural models trained on human texts to exhibit these kind of preferences, i.e. biases, but we show that this is not always the case. We explore 16 static and contextual embedding models in 9 languages and, when possible, compare them under similar training conditions. We introduce and release CA-WEAT, multilingual cultural aware tests to quantify biases, and compare them to previous English-centric tests. Our experiments confirm that monolingual static embeddings do exhibit human biases, but values differ across languages, being far from universal. Biases are less evident in contextual models, to the point that the original human association might be reversed. Multilinguality proves to be another variable that attenuates and even reverses the effect of the bias, specially in contextual multilingual models. In order to explain this variance among models and languages, we examine the effect of asymmetries in the training corpus, departures from isomorphism in multilingual embedding spaces and discrepancies in the testing measures between languages.", "track": "Multilinguality", "label": 13}, {"loc": [4.161087989807129, 4.456340789794922], "id": 3374, "title": "Entailer: Answering Questions with Faithful and Truthful Chains of Reasoning", "authors": "Oyvind Tafjord, Bhavana Dalvi Mishra and Peter Clark", "abstract": "Our goal is a question-answering (QA) system that can show how its answers are implied by its own internal beliefs via a systematic chain of reasoning. Such a capability would allow better understanding of why a model produced the answer it did. Our approach is to recursively combine a trained backward-chaining\nmodel, capable of generating a set of premises entailing an answer hypothesis, with a verifier that checks that the model itself believes those premises (and the entailment itself) through self-querying. To our knowledge, this is the first system to generate multistep chains that are both faithful (the answer follows from the reasoning) and truthful (the chain reflects the system's own internal beliefs). In evaluation using two different datasets, users judge that a majority (70%+) of generated chains clearly show how an answer follows from a set of facts - substantially better than a high-performance baseline - while preserving answer accuracy. By materializing model beliefs that systematically support an answer, new opportunities arise for understanding the model's system of belief, and diagnosing and correcting its misunderstandings when an answer is wrong.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.532333850860596, 8.517032623291016], "id": 3389, "title": "Near-Negative Distinction: Giving a Second Life to Human Evaluation Datasets", "authors": "Philippe Laban, Chien-Sheng Wu, Wenhao Liu and Caiming Xiong", "abstract": "Precisely assessing the progress in natural language generation (NLG) tasks is challenging, and human evaluation to establish a preference in a model's output over another is often necessary.\nHowever, human evaluation is usually costly, difficult to reproduce, and non-reusable.\nIn this paper, we propose a new and simple automatic evaluation method for NLG called Near-Negative Distinction (NND) that repurposes prior human annotations into NND tests.\nIn an NND test, an NLG model must place a higher likelihood on a high-quality output candidate than on a near-negative candidate with a known error.\nModel performance is established by the number of NND tests a model passes, as well as the distribution over task-specific errors the model fails on.\nThrough experiments on three NLG tasks (question generation, question answering, and summarization), we show that NND achieves a higher correlation with human judgments than standard NLG evaluation metrics. We then illustrate NND evaluation in four practical scenarios, for example performing fine-grain model analysis, or studying model training dynamics. Our findings suggest that NND can give a second life to human annotations and provide low-cost NLG evaluation.", "track": "Resources and Evaluation", "label": 1}, {"loc": [7.588822841644287, 12.390138626098633], "id": 3425, "title": "ToKen: Task Decomposition and Knowledge Infusion for Few-Shot Hate Speech Detection", "authors": "Badr AlKhamissi, Faisal Ladhak, Srinivasan Iyer, Veselin Stoyanov, Zornitsa Kozareva, Xian Li, Pascale Fung, Lambert Mathias, Asli Celikyilmaz and Mona Diab", "abstract": "Hate speech detection is complex; it relies on commonsense reasoning, knowledge of stereotypes, and an understanding of social nuance that differs from one culture to the next. It is also difficult to collect a large-scale hate speech annotated dataset. In this work, we frame this problem as a few-shot learning task, and show significant gains with decomposing the task into its \"constituent\" parts. In addition, we see that infusing knowledge from reasoning datasets (e.g. ATOMIC2020) improves the performance even further. Moreover, we observe that the trained models generalize to out-of-distribution datasets, showing the superiority of task decomposition and knowledge infusion compared to previously used methods. Concretely, our method outperforms the baseline by 17.83% absolute gain in the 16-shot case.", "track": "NLP Applications", "label": 0}, {"loc": [5.42059850692749, 5.067257881164551], "id": 3434, "title": "Are Hard Examples also Harder to Explain? A Study with Human and Model-Generated Explanations", "authors": "Swarnadeep Saha, Peter Hase, Nazneen Rajani and Mohit Bansal", "abstract": "Recent work on explainable NLP has shown that few-shot prompting can enable large pre-trained language models (LLMs) to generate grammatical and factual natural language explanations for data labels. In this work, we study the connection between explainability and sample hardness by investigating the following research question -- \"Are LLMs and humans equally good at explaining data labels for both easy and hard samples?\" We answer this question by first collecting human-written explanations in the form of generalizable commonsense rules on the task of Winograd Schema Challenge (Winogrande dataset). We compare these explanations with those generated by GPT-3 while varying the hardness of the test samples as well as the in-context samples. We observe that (1) GPT-3 explanations are as grammatical as human explanations regardless of the hardness of the test samples, (2) for easy examples, GPT-3 generates highly supportive explanations but human explanations are more generalizable, and (3) for hard examples, human explanations are significantly better than GPT-3 explanations both in terms of label-supportiveness and generalizability judgements. We also find that hardness of the in-context examples impacts the quality of GPT-3 explanations. Finally, we show that the supportiveness and generalizability aspects of human explanations are also impacted by sample hardness, although by a much smaller margin than models.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [4.569742202758789, 5.574893474578857], "id": 3463, "title": "Stanceosaurus: Classifying Stance Towards Multicultural Misinformation", "authors": "Jonathan Qiaoyi Zheng, Ashutosh Baheti, Tarek Naous, Wei Xu and Alan Ritter", "abstract": "We present Stanceosaurus, a new corpus of 28,033 tweets in English, Hindi and Arabic annotated with stance towards 250 misinformation claims. As far as we are aware, it is the largest corpus annotated with stance towards misinformation claims. The claims in Stanceosaurus originate from 15 fact-checking sources that cover diverse geographical regions and cultures. Unlike existing stance datasets, we introduce a more fine-grained 5-class labeling strategy with additional subcategories to distinguish implicit stance. Pre-trained transformer-based stance classifiers that are fine-tuned on our corpus show good generalization on unseen claims and regional claims from countries outside the training data. Cross-lingual experiments demonstrate Stanceosaurus' capability of training multilingual models, achieving 53.1 F1 on Hindi and 50.4 F1 on Arabic without any target-language fine-tuning. Finally, we show how a domain adaptation method can be used to improve performance on Stanceosaurus using additional RumourEval-2019 data. We will make Stanceosaurus publicly available to the research community upon publication and hope it will encourage further work on misinformation identification across languages and cultures.", "track": "Resources and Evaluation", "label": 1}, {"loc": [6.055968284606934, 5.8220953941345215], "id": 3465, "title": "Gendered Mental Health Stigma in Masked Language Models", "authors": "Wanyin Lin, Lucille Njoo, Anjalie Field, Ashish Sharma, Katharina Reinecke, Tim Althoff and Yulia Tsvetkov", "abstract": "Mental health stigma prevents many individuals from receiving the appropriate care, and social psychology studies have shown that mental health tends to be overlooked in men. In this work, we investigate gendered mental health stigma in masked language models. In doing so, we operationalize mental health stigma by developing a framework grounded in psychology research: we use clinical psychology literature to curate prompts, then evaluate the models' propensity to generate gendered words. We find that masked language models capture societal stigma about gender in mental health: models are consistently more likely to predict female subjects than male in sentences about having a mental health condition (32% vs. 19%), and this disparity is exacerbated for sentences that indicate treatment-seeking behavior. Furthermore, we find that different models capture dimensions of stigma differently for men and women, associating stereotypes like anger, blame, and pity more with women with mental health conditions than with men. In showing the complex nuances of models' gendered mental health stigma, we demonstrate that context and overlapping dimensions of identity are important considerations when assessing computational models' social biases.", "track": "Ethics", "label": 21}, {"loc": [1.7344635725021362, 3.818256139755249], "id": 3496, "title": "Efficient Nearest Neighbor Search for Cross-Encoder Models using Matrix Factorization", "authors": "Nishant Yadav, Nicholas Monath, Rico Angell, Manzil Zaheer and Andrew McCallum", "abstract": "Efficient k-nearest neighbor search is a fundamental task, foundational for many problems in NLP. When the similarity is measured by dot-product between dual-encoder vectors or L2-distance, there already exist many scalable and efficient search methods. But not so when similarity is measured by more accurate and expensive black-box neural similarity models, such as cross-encoders, which jointly encode the query and candidate neighbor. The cross-encoders' high computational cost typically limits their use to reranking candidates retrieved by a cheaper model, such as dual encoder or TF-IDF. However, the accuracy of such a two-stage approach is upper-bounded by the recall of the initial candidate set, and potentially requires additional training to align the auxiliary retrieval model with the cross-encoder model. In this paper, we present an approach that avoids the use of a dual-encoder for retrieval, relying solely on the cross-encoder. Retrieval is made efficient with CUR decomposition, a matrix decomposition approach that approximates all pairwise cross-encoder distances from a small subset of rows and columns of the distance matrix. Indexing items using our approach is computationally cheaper than training an auxiliary dual-encoder model through distillation. Empirically, for k > 10, our approach provides test-time recall-vs-computational cost trade-offs superior to the current widely-used methods that re-rank items retrieved using a dual-encoder or TF-IDF.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [5.928123950958252, 9.137007713317871], "id": 3501, "title": "Prompt-and-Rerank: A Method for Zero-Shot and Few-Shot Arbitrary Textual Style Transfer with Small Language Models", "authors": "Mirac Suzgun, Luke Melas-Kyriazi and Dan Jurafsky", "abstract": "We propose a method for arbitrary textual style transfer (TST)\u2014the task of transforming a text into any given style\u2014utilizing general-purpose pre-trained language models. Our method, Prompt-and-Rerank, is based on a mathematical formulation of the TST task, decomposing it into three constituent components: textual similarity, target style strength, and fluency. Our method uses zero-shot or few-shot prompting to obtain a set of candidate generations in the target style, and then re-ranks them according to the three components. Our method enables small pre-trained language models to perform on par with state-of-the-art large-scale models while using two orders of magnitude less compute and memory. We also investigate the effect of model size and prompt design (e.g., prompt paraphrasing and delimiter-pair choice) on style transfer quality across seven diverse textual style transfer datasets, finding, among other things, that delimiter-pair choice has a large impact on performance, and that models have biases on the direction of style transfer.", "track": "Natural Language Generation", "label": 6}, {"loc": [3.2644262313842773, 4.88023042678833], "id": 3523, "title": "Learning to Decompose: Hypothetical Question Decomposition Based on Comparable Texts", "authors": "Ben Zhou, Kyle Richardson, Xiaodong Yu and Dan Roth", "abstract": "Explicit decomposition modeling, which involves breaking down complex tasks into more straightforward and often more interpretable sub-tasks, has long been a central theme in developing robust and interpretable NLU systems. However, despite the many datasets and resources built as part of this effort, the majority have small-scale annotations and limited scope, which is insufficient to solve general decomposition tasks. In this paper, we look at large-scale intermediate pre-training of decomposition-based transformers using distant supervision from comparable texts, particularly large-scale parallel news. We show that with such intermediate pre-training, developing robust decomposition-based models for a diverse range of tasks becomes more feasible. For example, on semantic parsing, our model, DecompT5, improves 20% to 30% on two datasets, Overnight and TORQUE, over the baseline language model. We further use DecompT5 to build a novel decomposition-based QA system named DecompEntail, improving over state-of-the-art models, including GPT-3, on both HotpotQA and StrategyQA by 8% and 4%, respectively.", "track": "Question Answering", "label": 11}, {"loc": [5.273123264312744, 12.32708740234375], "id": 3541, "title": "Why is Winoground Hard? Investigating Failures in Visuolinguistic Compositionality", "authors": "Anuj Diwan, Layne Berry, Eunsol Choi, David Harwath and Kyle Mahowald", "abstract": "Recent visuolinguistic pre-trained models show promising progress on various end tasks such as image retrieval and video captioning. Yet, they fail miserably on the recently proposed Winoground dataset, which challenges models to match paired images and English captions, with items constructed to overlap lexically but differ in meaning (e.g., \"there is a mug in some grass\" vs. \"there is some grass in a mug\"). By annotating the dataset using new fine-grained tags, we show that solving the Winoground task requires not just compositional language understanding, but a host of other abilities like commonsense reasoning or locating small, out-of-focus objects in low-resolution images. In this paper, we identify the dataset's main challenges through a suite of experiments on related tasks (probing task, image retrieval task), data augmentation, and manual inspection of the dataset. Our analysis suggests that a main challenge in visuolinguistic models may lie in fusing visual and textual representations, rather than in compositional language understanding. We release our annotation and code at https://github.com/ajd12342/why-winoground-hard.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.912754058837891, 8.834760665893555], "id": 3614, "title": "Gradient-based Constrained Sampling from Language Models", "authors": "Sachin Kumar, Biswajit Paria and Yulia Tsvetkov", "abstract": "Large pretrained language models are successful at generating fluent text but are notoriously hard to controllably sample from. In this work, we study constrained sampling from such language models, i.e., generating text that satisfies user-defined constraints, while maintaining fluency and model's performance in a downstream task. \nWe propose MuCoLa---a sampling procedure that combines the log-likelihood of the language model with arbitrary (differentiable) constraints in a single energy function, and then generates samples in a non-autoregressive manner. \nSpecifically, it initializes the entire output sequence with noise and follows a Markov chain defined by Langevin Dynamics using the gradients of this energy. \nWe evaluate MuCoLa on text generation with soft and hard constraints as well as their combinations, obtaining significant improvements over competitive baselines for toxicity avoidance, sentiment control, and keyword-guided generation.", "track": "Natural Language Generation", "label": 6}, {"loc": [3.9820315837860107, 4.211977958679199], "id": 3617, "title": "TaCube: Pre-computing Data Cubes for Answering Numerical-Reasoning Questions over Tabular Data", "authors": "Fan Zhou, Mengkang Hu, Haoyu Dong, zhoujun cheng, Fan Cheng, Shi Han and Dongmei Zhang", "abstract": "Existing auto-regressive pre-trained language models (PLMs) like T5 and BART, have been well applied to table question answering by UNIFIEDSKG and TAPEX, respectively, and demonstrated state-of-the-art results on multiple benchmarks. However, auto-regressive PLMs are challenged by recent emerging numerical reasoning datasets, such as TAT-QA, due to the error-prone implicit calculation. In this paper, we present TaCube, to pre-compute aggregation/arithmetic results for the table in advance, so that they are handy and readily available for PLMs to answer numerical reasoning questions. TaCube systematically and comprehensively covers a collection of computational operations over table segments. By simply concatenating TaCube to the input sequence of PLMs, it shows significant experimental effectiveness. TaCube promotes the F1 score from 49.6% to 66.2% on TAT-QA and achieves new state-of-the-art results on WikiTQ (59.6% denotation accuracy). TaCube's improvements on numerical reasoning cases are even more notable: on TAT-QA, TaCube promotes the exact match accuracy of BART-large by 39.6% on sum, 52.5% on average, 36.6% on substraction, and 22.2% on division. We believe that TaCube is a general and portable pre-computation solution that can be potentially integrated to various numerical reasoning frameworks", "track": "Question Answering", "label": 11}, {"loc": [4.996258735656738, 3.7615697383880615], "id": 3636, "title": "Rich Knowledge Sources Bring Complex Knowledge Conflicts: Recalibrating Models to Reflect Conflicting Evidence", "authors": "Hung-Ting Chen, Michael Zhang and Eunsol Choi", "abstract": "Question answering models can use rich knowledge sources \u2014 up to one hundred retrieved passages and parametric knowledge in the large-scale language model (LM). Prior work assumes information in such knowledge sources is consistent with each other, paying little attention to how models blend information stored in their LM parameters with that from retrieved evidence documents. In this paper, we simulate knowledge conflicts (i.e., where parametric knowledge suggests one answer and different passages suggest different answers) and examine model behaviors. We find retrieval performance heavily impacts which sources models rely on, and current models mostly rely on non-parametric knowledge\nin their best-performing settings. We discover a troubling trend that contradictions among knowledge sources affect model confidence only marginally. To address this issue, we present a new calibration study, where models are discouraged from presenting any single answer when presented with multiple conflicting answer candidates in retrieved evidences.", "track": "Question Answering", "label": 11}, {"loc": [7.945428848266602, 8.67428207397461], "id": 3725, "title": "QA Domain Adaptation using Hidden Space Augmentation and Self-Supervised Contrastive Adaptation", "authors": "Zhenrui Yue, Huimin Zeng, Bernhard Kratzwald, Stefan Feuerriegel and Dong Wang", "abstract": "Question answering (QA) has recently shown impressive results for answering questions from customized domains. Yet, a common challenge is to adapt QA models to an unseen target domain. In this paper, we propose a novel self-supervised framework called QADA for QA domain adaptation. QADA introduces a novel data augmentation pipeline used to augment training QA samples. Different from existing methods, we enrich the samples via hidden space augmentation. For questions, we introduce multi-hop synonyms and sample augmented token embeddings with Dirichlet distributions. For contexts, we develop an augmentation method which learns to drop context spans via a custom attentive sampling strategy. Additionally, contrastive learning is integrated in the proposed self-supervised adaptation framework QADA. Unlike existing approaches, we generate pseudo labels and propose to train the model via a novel attention-based contrastive adaptation method. The attention weights are used to build informative features for discrepancy estimation that helps the QA model separate answers and generalize across source and target domains. To the best of our knowledge, our work is the first to leverage hidden space augmentation and attention-based contrastive adaptation for self-supervised domain adaptation in QA. Our evaluation shows that QADA achieves considerable improvements on multiple target datasets over state-of-the-art baselines in QA domain adaptation.", "track": "Question Answering", "label": 11}, {"loc": [8.25633716583252, 7.153022766113281], "id": 3779, "title": "When FLUE Meets FLANG: Benchmarks and Large Pretrained Language Model for Financial Domain", "authors": "Raj Shah, Kunal Chawla, Dheeraj Eidnani, Agam Shah, Wendi Du, Sudheer Chava, Natraj Raman, Charese Smiley, Jiaao Chen and Diyi Yang", "abstract": "Pre-trained language models have shown impressive performance on a variety of tasks and domains. Previous research on financial language models usually employs a generic training scheme to train standard model architectures, without completely leveraging the richness of the financial data. We propose a novel domain specific Financial LANGuage model (FLANG) which uses financial keywords and phrases for better masking, together with span boundary objective and in-filing objective. Additionally, the evaluation benchmarks in the field have been limited. To this end, we contribute the Financial Language Understanding Evaluation (FLUE), an open-source comprehensive suite of benchmarks for the financial domain. These include new benchmarks across 5 NLP tasks in financial domain as well as common benchmarks used in the previous research. Experiments on these benchmarks suggest that our model outperforms those in prior literature on a variety of NLP tasks. Our models, code and benchmark data will be made publicly available on Github and Huggingface.", "track": "Resources and Evaluation", "label": 1}, {"loc": [2.3020665645599365, 4.6393513679504395], "id": 3812, "title": "Retrieval as Attention: End-to-end Learning of Retrieval and Reading within a Single Transformer", "authors": "Zhengbao Jiang, Luyu Gao, Zhiruo Wang, Jun Araki, Haibo Ding, Jamie Callan and Graham Neubig", "abstract": "Systems for knowledge-intensive tasks such as open-domain question answering (QA) usually consist of two stages: efficient retrieval of relevant documents from a large corpus and detailed reading of the selected documents. This is usually done through two separate models, a retriever that encodes the query and finds nearest neighbors, and a reader based on Transformers. These two components are usually modeled separately, which necessitates a cumbersome implementation and is awkward to optimize in an end-to-end fashion. In this paper, we revisit this design and eschew the separate architecture and training in favor of a single Transformer that performs retrieval as attention (RAA), and end-to-end training solely based on supervision from the end QA task. We demonstrate for the first time that an end-to-end trained single Transformer can achieve both competitive retrieval and QA performance on in-domain datasets, matching or even slightly outperforming state-of-the-art dense retrievers and readers. Moreover, end-to-end adaptation of our model significantly boosts its performance on out-of-domain datasets in both supervised and unsupervised settings, making our model a simple and adaptable end-to-end solution for knowledge-intensive tasks.", "track": "Question Answering", "label": 11}, {"loc": [8.81047248840332, 6.608438968658447], "id": 3820, "title": "Reproducibility in Computational Linguistics: Is Source Code Enough?", "authors": "Mohammad Arvan, Lu\u00eds Pina and Natalie Parde", "abstract": "The availability of source code has been put forward as one of the most critical factors for improving the reproducibility of scientific research. This work studies trends in source code availability at major computational linguistics conferences, namely, ACL, EMNLP, LREC, NAACL, and COLING. We observe positive trends, especially in conferences that actively promote reproducibility. We follow this by conducting a reproducibility study of eight papers published in EMNLP 2021, finding that source code releases leave much to be desired. Moving forward, we suggest all conferences require self-contained artifacts and provide a venue to evaluate such artifacts at the time of publication. Authors can include small-scale experiments and explicit scripts to generate each result to improve the reproducibility of their work.", "track": "Resources and Evaluation", "label": 1}, {"loc": [2.596644878387451, 4.738193035125732], "id": 3826, "title": "Generating Information-Seeking Conversations from Unlabeled Documents", "authors": "Gangwoo Kim, Sungdong Kim, Kang Min Yoo and Jaewoo Kang", "abstract": "Synthesizing datasets for conversational question answering (CQA) from unlabeled documents remains challenging due to its interactive nature.\nMoreover, while modeling information needs is an essential key, only few studies have discussed it.\nIn this paper, we introduce a novel framework, **SimSeek**, (**Sim**ulating information-**Seek**ing conversation from unlabeled documents), and compare its two variants.\nIn our baseline, **SimSeek-sym**, a questioner generates follow-up questions upon the predetermined answer by an answerer.\nOn the contrary, **SimSeek-asym** first generates the question and then finds its corresponding answer under the conversational context.\nOur experiments show that they can synthesize effective training resources for CQA and conversational search tasks.\nAs a result, conversations from **SimSeek-asym** not only make more improvements in our experiments but also are favorably reviewed in a human evaluation.\nWe finally release a large-scale resource of synthetic conversations, **Wiki-SimSeek**, containing 2 million CQA pairs built upon Wikipedia documents.\nWith the dataset, our CQA model achieves the state-of-the-art performance on a recent CQA benchmark, QuAC.\nThe code and dataset are available at https://github.com/naver-ai/simseek", "track": "Question Answering", "label": 11}, {"loc": [6.162881851196289, 12.448103904724121], "id": 3867, "title": "Distill The Image to Nowhere: Inversion Knowledge Distillation for Multimodal Machine Translation", "authors": "RU Peng, Yawen Zeng and Jake Zhao", "abstract": "Past works on multimodal machine translation (MMT) elevate bilingual setup by incorporating additional aligned vision information.\nHowever, an image-must requirement of the multimodal dataset largely hinders MMT's development --- namely that it demands an aligned form of [image, source text, target text].\nThis limitation is generally troublesome during the inference phase especially when the aligned image is not provided as in the normal NMT setup.\nThus, in this work, we introduce IKD-MMT, a novel MMT framework to support the image-free inference phase via an inversion knowledge distillation scheme.\nIn particular, a multimodal feature generator is executed with a knowledge distillation module, which directly generates the multimodal feature from (only) source texts as the input.\nWhile there have been a few prior works entertaining the possibility to support image-free inference for machine translation, their performances have yet to rival the image-must translation.\nIn our experiments, we identify our method as the first image-free approach to comprehensively rival or even surpass (almost) all image-must frameworks, and achieved the state-of-the-art result on the often-used Multi30k benchmark. Our code and data are available\nat: https://github.com/pengr/IKD-mmt/tree/master..", "track": "Machine Translation", "label": 10}, {"loc": [7.219461441040039, 4.67414665222168], "id": 3871, "title": "A Multifaceted Framework to Evaluate Evasion, Content Preservation, and Misattribution in Authorship Obfuscation Techniques", "authors": "Malik Altakrori, Thomas Scialom, Benjamin C. M. Fung and Jackie Chi Kit Cheung", "abstract": "Authorship obfuscation techniques have commonly been evaluated based on their ability to hide the author's identity (evasion) while preserving the content of the original text. However, to avoid overstating the systems' effectiveness, evasion detection must be evaluated using competitive identification techniques in settings that mimic real-life scenarios, and the outcomes of the content-preservation evaluation have to be interpretable by potential users of these obfuscation tools. Motivated by recent work on cross-topic authorship identification and content preservation in summarization, we re-evaluate different authorship obfuscation techniques on detection evasion and content preservation. Furthermore, we propose a new information-theoretic measure to characterize the misattribution harm that can be caused by detection evasion. Our results reveal key weaknesses in state-of-the-art obfuscation techniques and a surprisingly competitive effectiveness from a back-translation baseline in all evaluation aspects.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [5.763286590576172, 4.703256130218506], "id": 3904, "title": "SafeText: A Benchmark for Exploring Physical Safety in Language Models", "authors": "Sharon Levy, Emily Allaway, Melanie Subbiah, Lydia Chilton, Desmond Patton, Kathleen McKeown and William Yang Wang", "abstract": "Understanding what constitutes safe text is an important issue in natural language processing and can often prevent the deployment of models deemed harmful and unsafe. One such type of safety that has been scarcely studied is commonsense physical safety, i.e. text that is not explicitly violent and requires additional commonsense knowledge to comprehend that it leads to physical harm. We create the first benchmark dataset, SafeText, comprising real-life scenarios with paired safe and physically unsafe pieces of advice. We utilize SafeText to empirically study commonsense physical safety across various models designed for text generation and commonsense reasoning tasks. We find that state-of-the-art large language models are susceptible to the generation of unsafe text and have difficulty rejecting unsafe advice. As a result, we argue for further studies of safety and the assessment of commonsense physical safety in models before release.", "track": "Ethics", "label": 21}, {"loc": [7.578199863433838, 9.050206184387207], "id": 3992, "title": "Ground-Truth Labels Matter: A Deeper Look into Input-Label Demonstrations", "authors": "Kang Min Yoo, Junyeob Kim, Hyuhng Joon Kim, Hyunsoo Cho, Hwiyeol Jo, Sang-Woo Lee, Sang-goo Lee and Taeuk Kim", "abstract": "Despite recent explosion of interests in in-context learning, the underlying mechanism and the precise impact of the quality of demonstrations remain elusive.\nIntuitively, ground-truth labels should have as much impact in in-context learning (ICL) as supervised learning, but recent work reported that the input-label correspondence is significantly less important than previously thought.\nIntrigued by this counter-intuitive observation, we re-examine the importance of ground-truth labels in in-context learning.\nWith the introduction of two novel metrics, namely Label-Correctness Sensitivity and Ground-truth Label Effect Ratio (GLER), we were able to conduct quantifiable analysis on the impact of ground-truth label demonstrations.\nThrough extensive analyses, we find that the correct input-label mappings can have varying impacts on the downstream in-context learning performances, depending on the experimental configuration.\nThrough additional studies, we identify key components, such as the verbosity of prompt templates and the language model size, as the controlling factor to achieve more noise-resilient ICL.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [4.582016468048096, 7.6970062255859375], "id": 4233, "title": "Exploiting domain-slot related keywords description for Few-Shot Cross-Domain Dialogue State Tracking", "authors": "Gao Qixiang, Guanting Dong, Yutao Mou, Liwen Wang, Chen Zeng, Daichi Guo, Mingyang Sun and Weiran Xu", "abstract": "Collecting dialogue data with domain-slot-value labels for dialogue state tracking (DST) could be a costly process. In this paper, we propose a novel framework based on domain-slot related description to tackle the challenge of few-shot cross-domain DST. Specifically, we design an extraction module to extract domain-slot related verbs and nouns in the dialogue. Then, we integrates them into the description, which aims to prompt the model to identify the slot information. Furthermore, we introduce a random sampling strategy to improve the domain generalization ability of the model. We utilize a pre-trained model to encode contexts and description and generates answers with an auto-regressive manner. Experimental results show that our approaches substantially outperform the existing few-shot DST methods on MultiWOZ and gain strong improvements on the slot accuracy comparing to existing slot description methods.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [9.780959129333496, 6.335411548614502], "id": 4291, "title": "CoCoA: An Encoder-Decoder Model for Controllable Code-switched Generation", "authors": "Sneha Mondal, Ritika ., Shreya Pathak, Preethi Jyothi and Aravindan Raghuveer", "abstract": "Code-switching has seen growing interest in recent years as an important multilingual NLP phenomenon. Generating code-switched text for data augmentation has been sufficiently well-explored. However, there is no prior work on generating code-switched text with fine-grained control on the degree of code-switching and the lexical choices used to convey formality. Due to the inherently high diversity in code-switched text, exercising fine-grained control over generated text is very useful both from a generation perspective and a data augmentation perspective. We present CoCoa, an encoder-decoder translation model that converts monolingual Hindi text to Hindi-English code-switched text with both encoder-side and decoder-side interventions to achieve fine-grained controllable generation. CoCoa can be invoked at test-time to synthesize code-switched text that is simultaneously faithful to syntactic and lexical attributes relevant to code-switching. CoCoa outputs were subjected to rigorous subjective and objective evaluations. Human evaluations establish that our outputs are of superior quality while being faithful to the desired attributes. We show significantly improved BLEU scores when compared with human-generated CS text. Compared to competitive baselines, we show 10% reduction in perplexity on a language modeling task and also demonstrate clear improvements on a downstream code-switched sentiment analysis task.", "track": "Multilinguality", "label": 13}, {"loc": [8.307180404663086, 7.350909233093262], "id": 4366, "title": "Towards Climate Awareness in NLP Research", "authors": "Daniel Hershcovich, Nicolas Webersinke, Mathias Kraus, Julia Bingler and Markus Leippold", "abstract": "The climate impact of AI, and NLP research in particular, has become a serious issue given the enormous amount of energy that is increasingly being used for training and running computational models. Consequently, increasing focus is placed on efficient NLP. However, this important initiative lacks simple guidelines that would allow for systematic climate reporting of NLP research. We argue that this deficiency is one of the reasons why very few publications in NLP report key figures that would allow a more thorough examination of environmental impact, and present a quantitative survey to demonstrate this. As a remedy, we propose a climate performance model card with the primary purpose of being practically usable with only limited information about experiments and the underlying computer hardware. We describe why this step is essential to increase awareness about the environmental impact of NLP research and, thereby, paving the way for more thorough discussions.", "track": "Theme Track", "label": 18}, {"loc": [4.247328281402588, 7.546331882476807], "id": 4386, "title": "Navigating Connected Memories with a Task-oriented Dialog System", "authors": "Satwik Kottur, Seungwhan Moon, Alborz Geramifard and Babak Damavandi", "abstract": "Recent years have seen an increasing trend in the volume of personal media captured by users, thanks to the advent of smartphones and smart glasses, resulting in large media collections. Despite conversation being an intuitive human-computer interface, current efforts focus mostly on single-shot natural language based media retrieval to aid users query their media and re-live their memories. This severely limits the search functionality as users can neither ask follow-up queries nor obtain information without first formulating a single-turn query.\n\nIn this work, we propose dialogs for connected memories as a powerful tool to empower users to search their media collection through a multi-turn, interactive conversation. Towards this, we collect a new task-oriented dialog dataset COMET, which contains 11.5k user\u2194assistant dialogs (totalling 103k utterances), grounded in simulated personal memory graphs. We employ a resource-efficient, two-phase data collection pipeline that uses: (1) a novel multimodal dialog simulator that generates synthetic dialog flows grounded in memory graphs, and, (2) manual paraphrasing to obtain natural language utterances. We analyze COMET, formulate four main tasks to benchmark meaningful progress, and adopt state-of-the-art language models as strong baselines, in order to highlight the multimodal challenges captured by our dataset.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [8.869048118591309, 7.9376115798950195], "id": 4390, "title": "Language Model Decomposition: Quantifying the Dependency and Correlation of Language Models", "authors": "Hao Zhang", "abstract": "Pre-trained language models (LMs), such as BERT (Devlin et al., 2018) and its variants, have led to significant improvements on various NLP tasks in past years. However, a theoretical framework for studying their relationships is still missing. In this paper, we fill this gap by investigating the linear dependency between pre-trained LMs. The linear dependency of LMs is defined analogously to the linear dependency of vectors. We propose Language Model Decomposition (LMD) to represent a LM using a linear combination of other LMs as basis, and derive the closed-form solution. A goodness-of-fit metric for LMD similar to the coefficient of determination is defined and used to measure the linear dependency of a set of LMs. In experiments, we find that BERT and eleven (11) BERT-like LMs are 91% linearly dependent. This observation suggests that current state-of-the-art (SOTA) LMs are highly \"correlated\". To further advance SOTA we need more diverse and novel LMs that are less dependent on existing LMs.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [10.825882911682129, 9.34489917755127], "id": 4405, "title": "SynGEC: Syntax-Enhanced Grammatical Error Correction with a Tailored GEC-Oriented Parser", "authors": "Yue Zhang, Bo Zhang, Zhenghua Li, Zuyi Bao, Chen Li and Min Zhang", "abstract": "This work proposes a syntax-enhanced grammatical error correction (GEC) approach named SynGEC that effectively incorporates dependency syntactic information into the encoder part of GEC models. The key challenge for this idea is that off-the-shelf parsers are unreliable when processing ungrammatical sentences. To confront this challenge, we propose to build a tailored GEC-oriented parser (GOPar) using parallel GEC training data as a pivot. First, we design an extended syntax representation scheme that allows us to represent both grammatical errors and syntax in a unified tree structure. Then, we obtain parse trees of the source incorrect sentences by projecting trees of the target correct sentences. Finally, we train GOPar with such projected trees. For GEC, we employ the graph convolution network to encode source-side syntactic information produced by GOPar, and fuse them with the outputs of the Transformer encoder. Experiments on mainstream English and Chinese GEC datasets show that our proposed SynGEC approach consistently and substantially outperforms strong baselines and achieves competitive performance. Our code and data are all publicly available at https://github.com/HillZhang1999/SynGEC.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [4.457025051116943, 5.449256420135498], "id": 4436, "title": "Varifocal Question Generation for Fact-checking", "authors": "Nedjma Djouhra Ousidhoum, Zhangdie Yuan and Andreas Vlachos", "abstract": "Fact-checking requires retrieving evidence related to a claim under investigation. The task can be formulated as question generation based on a claim, followed by question answering.\nHowever, recent question generation approaches assume that the answer is known and typically contained in a passage given as input,\nwhereas such passages \nare what is being sought when verifying a claim.\nIn this paper, we present {\\it Varifocal}, a method that generates questions based on different focal points within a given claim, i.e.\\ different spans of the claim and its metadata, such as its source and date.\nOur method outperforms previous work on a fact-checking question generation dataset on a wide range of automatic evaluation metrics.\nThese results are corroborated by our manual evaluation, which indicates that our method generates more relevant and informative questions.\nWe further demonstrate the potential of focal points in generating sets of clarification questions for product descriptions.", "track": "NLP Applications", "label": 0}, {"loc": [9.337663650512695, 6.2572102546691895], "id": 4439, "title": "Bilingual Lexicon Induction for Low-Resource Languages using Graph Matching via Optimal Transport", "authors": "Kelly Marchisio, Ali Saad-Eldin, Kevin Duh, carey e. priebe and Philipp Koehn", "abstract": "Bilingual lexicons form a critical component of various natural language processing applications, including unsupervised and semisupervised machine translation and crosslingual information retrieval. In this work, we improve bilingual lexicon induction performance across 40 language pairs with a graph-matching method based on optimal transport. The method is especially strong with low amounts of supervision.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [6.16102409362793, 5.918120384216309], "id": 4441, "title": "Whose Language Counts as High Quality? Measuring Language Ideologies in Text Data Selection", "authors": "Suchin Gururangan, Dallas Card, Sarah K. Dreier, Emily K. Gade, Leroy Zhifei Wang, Zeyu Wang, Luke Zettlemoyer and Noah A. Smith", "abstract": "Language models increasingly rely on massive web crawls for diverse text data. However, these sources are rife with undesirable content. As such, resources like Wikipedia, books, and news often serve as anchors for automatically selecting web text most suitable for language modeling, a process typically referred to as quality filtering. Using a new dataset of U.S. high school newspaper articles---written by students from across the country---we investigate whose language is preferred by the quality filter used for GPT-3. We find that newspapers from larger schools, located in wealthier, educated, and urban zones (ZIP codes) are more likely to be classified as high quality. We also show that this quality measurement is unaligned with other sensible metrics, such as factuality or literary acclaim. We argue that privileging any corpus as high quality entails a language ideology, and more care is needed to construct training corpora for language models, with better transparency and justification for the inclusion or exclusion of various texts.", "track": "Theme Track", "label": 18}, {"loc": [1.1150788068771362, 7.786769390106201], "id": 4446, "title": "ConReader: Exploring Implicit Relations in Contracts for Contract Clause Extraction", "authors": "Weiwen Xu, Yang Deng, Wenqiang Lei, Wenlong ZHAO, Tat-Seng Chua and Wai Lam", "abstract": "We study automatic Contract Clause Extraction (CCE) by modeling implicit relations in legal contracts. Existing CCE methods mostly treat contracts as plain text, creating a substantial barrier to understanding contracts of high complexity. In this work, we first comprehensively analyze the complexity issues of contracts and distill out three implicit relations commonly found in contracts, namely, 1) Long-range Context Relation that captures the correlations of distant clauses; 2) Term-Definition Relation that captures the relation between important terms with their corresponding definitions, and 3) Similar Clause Relation that captures the similarities between clauses of the same type. Then we propose a novel framework ConReader to exploit the above three relations for better contract understanding and improving CCE. Experimental results show that ConReader makes the prediction more interpretable and achieves new state-of-the-art on two CCE tasks in both conventional and zero-shot settings.", "track": "NLP Applications", "label": 0}, {"loc": [7.782354831695557, 8.191479682922363], "id": 4456, "title": "Training Dynamics for Curriculum Learning: A Study on Monolingual and Cross-lingual NLU", "authors": "Fenia Christopoulou, Gerasimos Lampouras and Ignacio Iacobacci", "abstract": "Curriculum Learning (CL) is a technique of training models via ranking examples in a typically increasing difficulty trend with the aim of accelerating convergence and improving generalisability. Current approaches for Natural Language Understanding (NLU) tasks use CL to improve in-distribution data performance often via heuristic-oriented or task-agnostic difficulties. In this work, instead, we employ CL for NLU by taking advantage of training dynamics as difficulty metrics, i.e., statistics that measure the behavior of the model at hand on specific task-data instances during training and propose modifications of existing CL schedulers based on these statistics. Differently from existing works, we focus on evaluating models on in-distribution (ID), out-of-distribution (OOD) as well as zero-shot (ZS) cross-lingual transfer datasets. We show across several NLU tasks that CL with training dynamics can result in better performance mostly on zero-shot cross-lingual transfer and OOD settings with improvements up by 8.5% in certain cases. Overall, experiments indicate that training dynamics can lead to better performing models with smoother training compared to other difficulty metrics while being 20% faster on average. In addition, through analysis we shed light on the correlations of task-specific versus task-agnostic metrics.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [8.757943153381348, 8.310791969299316], "id": 4475, "title": "Revisiting Parameter-Efficient Tuning: Are We Really There Yet?", "authors": "Guanzheng Chen, Fangyu Liu, Zaiqiao Meng and Shangsong Liang", "abstract": "Parameter-Efficient Tuning (PETuning) methods have been deemed by many as the new paradigm for using pretrained language models (PLMs). By tuning just a fraction amount of parameters comparing to full model finetuning, PETuning methods claim to have achieved performance on par with or even better than finetuning. In this work, we take a step back and re-examine these PETuning methods by conducting the first comprehensive investigation into the training and evaluation of them. We found the problematic validation and testing practice in current studies, when accompanied by the instability nature of PETuning methods, has led to unreliable conclusions. When being compared under a truly fair evaluation protocol, PETuning cannot yield consistently competitive performance while finetuning remains to be the best-performing method in medium- and high-resource settings. We delve deeper into the cause of the instability and observed that the number of trainable parameters and training iterations are two main factors: reducing trainable parameters and prolonging training iterations may lead to higher stability in PETuning methods.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [2.404162645339966, 7.303460597991943], "id": 4482, "title": "Transfer Learning from Semantic Role Labeling to Event Argument Extraction with Template-based Slot Querying", "authors": "Zhisong Zhang, Emma Strubell and Eduard Hovy", "abstract": "In this work, we investigate transfer learning from semantic role labeling (SRL) to event argument extraction (EAE), considering their similar argument structures. We view the extraction task as a role querying problem, unifying various methods into a single framework. There are key discrepancies on role labels and distant arguments between semantic role and event argument annotations. To mitigate these discrepancies, we specify natural language-like queries to tackle the label mismatch problem and devise argument augmentation to recover distant arguments. We show that SRL annotations can serve as a valuable resource for EAE, and a template-based slot querying strategy is especially effective for facilitating the transfer. In extensive evaluations on two English EAE benchmarks, our proposed model obtains impressive zero-shot results by leveraging SRL annotations, reaching nearly 80% of the fullysupervised scores. It further provides benefits in low-resource cases, where few EAE annotations are available. Moreover, we show that our approach generalizes to cross-domain and multilingual scenarios.", "track": "Information Extraction", "label": 5}, {"loc": [9.246109962463379, 6.88762903213501], "id": 4485, "title": "Calibrating Zero-shot Cross-lingual (Un-)structured Predictions", "authors": "Zhengping Jiang, Anqi Liu and Benjamin Van Durme", "abstract": "We investigate model calibration in the setting of zero-shot cross-lingual transfer with large-scale pre-trained language models. The level of model calibration is an important metric for evaluating the trustworthiness of predictive models. There exists an essential need for model calibration when natural language models are deployed in critical tasks. We study different post-training calibration methods in structured and unstructured prediction tasks. We find that models trained with data from the source language become less calibrated when applied to the target language and that calibration errors increase with intrinsic task difficulty and relative sparsity of training data. Moreover, we observe a potential connection between the level of calibration error and an earlier proposed measure of the distance from English to other languages. Finally, our comparison demonstrates that among other methods Temperature Scaling (TS) generalizes well to distant languages, but TS fails to calibrate more complex confidence estimation in structured predictions compared to more expressive alternatives like Gaussian Process Calibration.", "track": "Multilinguality", "label": 13}, {"loc": [6.0487213134765625, 8.463147163391113], "id": 4489, "title": "PRINCE: Prefix-Masked Decoding for Knowledge Enhanced Sequence-to-Sequence Pre-Training", "authors": "Song Xu, Haoran Li, Peng Yuan, Youzheng Wu and Xiaodong He", "abstract": "Pre-trained Language Models (PLMs) have shown effectiveness in various Natural Language Processing (NLP) tasks. Denoising autoencoder is one of the most successful pre-training frameworks, learning to recompose the original text given a noise-corrupted one. The existing studies mainly focus on injecting noises into the input. This paper introduces a simple yet effective pre-training paradigm, equipped with a knowledge-enhanced decoder that predicts the next entity token with noises in the prefix, explicitly strengthening the representation learning of entities that span over multiple input tokens. Specifically, when predicting the next token within an entity, we feed masks into the prefix in place of some of the previous ground-truth tokens that constitute the entity. Our model achieves new state-of-the-art results on two knowledge-driven data-to-text generation tasks with up to 2% BLEU gains.", "track": "Natural Language Generation", "label": 6}, {"loc": [3.7194936275482178, 9.547513008117676], "id": 4531, "title": "How Far are We from Robust Long Abstractive Summarization?", "authors": "Huan Yee Koh, Jiaxin Ju, He Zhang, Ming Liu and Shirui Pan", "abstract": "Abstractive summarization has made tremendous progress in recent years. In this work, we perform fine-grained human annotations to evaluate long document abstractive summarization systems (i.e., models and metrics) with the aim of implementing them to generate reliable summaries. For long document abstractive models, we show that the constant strive for state-of-the-art ROUGE results can lead us to generate more relevant summaries but not factual ones. For long document evaluation metrics, human evaluation results show that ROUGE remains the best at evaluating the relevancy of a summary. It also reveals important limitations of factuality metrics in detecting different types of factual errors and the reasons behind the effectiveness of BARTScore. We then suggest promising directions in the endeavor of developing factual consistency metrics. Finally, we release our annotated long document dataset with the hope that it can contribute to the development of metrics across a broader range of summarization settings.", "track": "Summarization", "label": 14}, {"loc": [6.900421142578125, 5.912875652313232], "id": 4550, "title": "Measuring Context-Word Biases in Lexical Semantic Datasets", "authors": "Qianchu Liu, Diana McCarthy and Anna Korhonen", "abstract": "State-of-the-art pretrained contextualized models (PCM) eg. BERT use tasks such as WiC and WSD to evaluate their word-in-context representations. This inherently assumes that performance in these tasks reflect how well a model represents the coupled word and context semantics. We question this assumption by presenting the first quantitative analysis on the context-word interaction being tested in major contextual lexical semantic tasks. To achieve this, we run probing baselines on masked input, and propose measures to calculate and visualize the degree of context or word biases in existing datasets. The analysis was performed on both models and humans. Our findings demonstrate that models are usually not being tested for word-in-context semantics in the same way as humans are in these tasks, which helps us better understand the model-human gap. Specifically, to PCMs, most existing datasets fall into the extreme ends (the retrieval-based tasks exhibit strong target word bias while WiC-style tasks and WSD show strong context bias); In comparison, humans are less biased and achieve much better performance when both word and context are available than with masked input. We recommend our framework for understanding and controlling these biases for model interpretation and future task design.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [7.12595796585083, 7.601161479949951], "id": 4567, "title": "Iteratively Prompt Pre-trained Language Models for Chain of Thought", "authors": "Boshi Wang, Xiang Deng and Huan Sun", "abstract": "While Pre-trained Language Models (PLMs) internalize a great amount of world knowledge, they have been shown incapable of recalling these knowledge to solve tasks requiring complex & multi-step reasoning. Similar to how humans develop a \"chain of thought\" for these tasks, how can we equip PLMs with such abilities? In this work, we explore an iterative prompting framework, a new prompting paradigm which progressively elicits relevant knowledge from PLMs for multi-step inference. We identify key limitations of existing prompting methods, namely they are either restricted to queries with a single identifiable relation/predicate, or being agnostic to input contexts, which makes it difficult to capture variabilities across different inference steps. We propose an iterative context-aware prompter, which addresses these limitations by learning to dynamically synthesize prompts conditioned on the current step's contexts. Experiments on three datasets involving multi-step reasoning show the effectiveness of the iterative scheme and the context-aware prompter design.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [7.290417671203613, 6.969555854797363], "id": 4584, "title": "Unobserved Local Structures Make Compositional Generalization Hard", "authors": "Ben Bogin, Shivanshu Gupta and Jonathan Berant", "abstract": "While recent work has shown that sequence-to-sequence models struggle to generalize to new compositions (termed compositional generalization), little is known on what makes compositional generalization hard on a particular test instance. In this work, we investigate the factors that make generalization to certain test instances challenging. We first substantiate that some examples are more difficult than others by showing that different models consistently fail or succeed on the same test instances. Then, we propose a criterion for the difficulty of an example: a test instance is hard if it contains a local structure that was not observed at training time. We formulate a simple decision rule based on this criterion and empirically show it predicts instance-level generalization well across 5 different semantic parsing datasets, substantially better than alternative decision rules. Last, we show local structures can be leveraged for creating difficult adversarial compositional splits and also to improve compositional generalization under limited training budgets by strategically selecting examples for the training set.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [6.916767597198486, 9.901144981384277], "id": 108, "title": "Mitigating Data Sparsity for Short Text Topic Modeling by Topic-Semantic Contrastive Learning", "authors": "Xiaobao Wu, Anh Tuan Luu and Xinshuai Dong", "abstract": "To overcome the data sparsity issue in short text topic modeling,\n existing methods commonly rely on data augmentation or the data characteristic of short texts\n to introduce more word co-occurrence information.\n However, most of them do not make full use of the augmented data or the data characteristic:\n they insufficiently learn the relations among samples in data,\n leading to dissimilar topic distributions of semantically similar text pairs.\n To better address data sparsity, in this paper\n we propose a novel short text topic modeling framework, Topic-Semantic Contrastive Topic Model (TSCTM).\n To sufficiently model the relations among samples,\n we employ a new contrastive learning method with efficient positive and negative sampling strategies based on topic semantics.\n This contrastive learning method refines the representations, enriches the learning signals, and thus mitigates the sparsity issue.\n Extensive experimental results show that our TSCTM outperforms state-of-the-art baselines\n regardless of the data augmentation availability, producing high-quality topics and topic distributions.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [3.99932599067688, 7.540581226348877], "id": 112, "title": "Back to the Future: Bidirectional Information Decoupling Network for Multi-turn Dialogue Modeling", "authors": "Yiyang Li, Hai Zhao and Zhuosheng Zhang", "abstract": "Multi-turn dialogue modeling as a challenging branch of natural language understanding (NLU), aims to build representations for machines to understand human dialogues, which provides a solid foundation for multiple downstream tasks. Recent studies of dialogue modeling commonly employ pre-trained language models (PrLMs) to encode the dialogue history as successive tokens, which is insufficient in capturing the temporal characteristics of dialogues. Therefore, we propose Bidirectional Information Decoupling Network (BiDeN) as a universal dialogue encoder, which explicitly incorporates both the past and future contexts and can be generalized to a wide range of dialogue-related tasks. Experimental results on datasets of different downstream tasks demonstrate the universality and effectiveness of our BiDeN.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [5.809638977050781, 5.537075519561768], "id": 113, "title": "Calibration Meets Explanation: A Simple and Effective Approach for Model Confidence Estimates", "authors": "Dongfang Li, Baotian Hu and Qingcai Chen", "abstract": "Calibration strengthens the trustworthiness of black-box models by producing better accurate confidence estimates on given examples. However, little is known about if model explanations can help confidence calibration. Intuitively, humans look at important features attributions and decide whether the model is trustworthy. Similarly, the explanations may tell us when the model might know and when it does not. Inspired by this, we propose a method named CME that leverages model explanations to make the model less confident with non-inductive attributions. The idea is that when the model is not highly confident, it is difficult to identify strong indications of any class, and the tokens accordingly do not have high attribution scores for any class and vice versa. We conduct extensive experiments on six datasets with two popular pre-trained language models in the in-domain and out-of-domain settings. The results show that CME improves calibration performance in all settings. The expected calibration errors are further reduced when combined with temperature scaling. Our findings highlight that model explanations can help calibrate posterior estimates.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [10.640687942504883, 7.656763076782227], "id": 131, "title": "Non-Autoregressive Neural Machine Translation: A Call for Clarity", "authors": "Robin M. Schmidt, Telmo Pires, Stephan Peitz and Jonas L\u00f6\u00f6f", "abstract": "Non-autoregressive approaches aim to improve the inference speed of translation models by only requiring a single forward pass to generate the output sequence instead of iteratively producing each predicted token. Consequently, their translation quality still tends to be inferior to their autoregressive counterparts due to several issues involving output token interdependence. In this work, we take a step back and revisit several techniques that have been proposed for improving non-autoregressive translation models and compare their combined translation quality and speed implications under third-party testing environments. We provide novel insights for establishing strong baselines using length prediction or CTC-based architecture variants and contribute standardized BLEU, chrF++, and TER scores using sacreBLEU on four translation tasks, which crucially have been missing as inconsistencies in the use of tokenized BLEU lead to deviations of up to 1.7 BLEU points. Our open-sourced code is integrated into fairseq for reproducibility.", "track": "Machine Translation", "label": 10}, {"loc": [10.084028244018555, 8.063382148742676], "id": 144, "title": "RED-ACE: Robust Error Detection for ASR using Confidence Embeddings", "authors": "Zorik Gekhman, Dina Zverinski, Jonathan Mallinson and Genady Beryozkin", "abstract": "ASR Error Detection (AED) models aim to post-process the output of Automatic Speech Recognition (ASR) systems, in order to detect transcription errors. Modern approaches usually use text-based input, comprised solely of the ASR transcription hypothesis, disregarding additional signals from the ASR model. Instead, we utilize the ASR system's word-level confidence scores for improving AED performance. Specifically, we add an ASR Confidence Embedding (ACE) layer to the AED model's encoder, allowing us to jointly encode the confidence scores and the transcribed text into a contextualized representation. Our experiments show the benefits of ASR confidence scores for AED, their complementary effect over the textual signal, as well as the effectiveness and robustness of ACE for combining these signals. To foster further research, we publish a novel AED dataset consisting of ASR outputs on the LibriSpeech corpus with annotated transcription errors.", "track": "NLP Applications", "label": 0}, {"loc": [7.887290000915527, 7.006139755249023], "id": 149, "title": "Fast-R2D2: A Pretrained Recursive Neural Network based on Pruned CKY for Grammar Induction and Text Representation", "authors": "Xiang Hu, Haitao Mi, Liang Li and Gerard de Melo", "abstract": "Chart-based models have shown great potential in unsupervised grammar induction, running recursively and hierarchically, but requiring O(n\u00b3) time-complexity. The Recursive Transformer based on Differentiable Trees (R2D2) makes it possible to scale to large language model pretraining even with a complex tree encoder, by introducing a heuristic pruning method.\nHowever, its rule-based pruning process suffers from local optima and slow inference. In this paper, we propose a unified R2D2 method that overcomes these issues. We use a top-down unsupervised parser as a model-guided pruning method, which also enables parallel encoding during inference. Our parser casts parsing as a split point scoring task by first scoring all split points for a given sentence and then using the highest-scoring one to recursively split a span into two parts. The reverse order of the splits is considered as the order of pruning in the encoder. We optimize the unsupervised parser by minimizing the Kullback\u2013Leibler distance between tree probabilities from the parser and the R2D2 model.\nOur experiments show that our Fast-R2D2 significantly improves the grammar induction quality and achieves competitive results in downstream tasks.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [0.4715983271598816, 7.105227470397949], "id": 165, "title": "A Localized Geometric Method to Match Knowledge in Low-dimensional Hyperbolic Space", "authors": "Bo Hui, Tian Xia and Wei-Shinn Ku", "abstract": "Matching equivalent entities across Knowledge graphs is a pivotal step for knowledge fusion. Previous approaches usually study the problem in Euclidean space. However, recent works have shown that hyperbolic space has a higher capacity than Euclidean space and hyperbolic embedding can represent the hierarchical structure in a knowledge graph. In this paper, we propose a localized geometric method to find equivalent entities in hyperbolic space. Specifically, we use a hyperbolic neural network to encode the lingual information of entities and the structure of both knowledge graphs into a low-dimensional hyperbolic space. To address the asymmetry of structure on different KGs and the localized nature of relations, we learn an instance-specific geometric mapping function based on rotation to match entity pairs. A contrastive loss function is used to train the model. The experiment verifies the power of low-dimensional hyperbolic space for entity matching and shows that our method outperforms the state of the art by a large margin.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [3.8399946689605713, 4.703279495239258], "id": 166, "title": "Memory-assisted prompt editing to improve GPT-3 after deployment", "authors": "Aman Madaan, Niket Tandon, Peter Clark and Yiming Yang", "abstract": "Large LMs such as GPT-3 are powerful, but can commit mistakes that are obvious to humans. For example, GPT-3 would mistakenly interpret \"What word is similar to good?\" to mean a homophone, while the user intended a synonym. Our goal is to effectively correct such errors via user interactions with the system but without retraining, which will be prohibitively costly. We pair GPT-3 with a growing memory of recorded cases where the model misunderstood the user's intents, along with user feedback for clarification. Such a memory allows our system to produce enhanced prompts for any new query based on the user feedback for error correction on similar cases in the past. On four tasks (two lexical tasks, two advanced ethical reasoning tasks), we show how a (simulated) user can interactively teach a deployed GPT-3, substantially increasing its accuracy over the queries with different kinds of misunderstandings by the GPT-3. Our approach is a step towards the low-cost utility enhancement for very large pre-trained LMs.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [6.190282821655273, 12.469200134277344], "id": 176, "title": "LVP-M3: Language-aware Visual Prompt for Multilingual Multimodal Machine Translation", "authors": "hongcheng guo, Jiaheng Liu, Haoyang Huang, Jian Yang, Zhoujun Li, Dongdong Zhang and Zheng Cui", "abstract": "Multimodal Machine Translation (MMT) focuses on enhancing text-only translation with visual features, which has attracted considerable attention from both natural language processing and computer vision communities. Recent advances still struggle to train a separate model for each language pair, which is costly and unaffordable when the number of languages increases in the real world. In other words, the multilingual multimodal machine translation (Multilingual MMT) task has not been investigated, which aims to handle the aforementioned issues by providing a shared semantic space for multiple languages. Besides, the image modality has no language boundaries, which is superior to bridging the semantic gap between languages. To this end,\nwe first propose the Multilingual MMT task by establishing two new Multilingual MMT benchmark datasets covering seven languages.\nThen, an effective baseline LVP-M3 using visual prompts is proposed to support translations between different languages,\nwhich includes three stages (token encoding, language-aware visual prompt generation, and language translation). Extensive experimental results on our constructed benchmark datasets demonstrate the effectiveness of LVP-M3 method for Multilingual MMT.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [8.137381553649902, 2.9900803565979004], "id": 178, "title": "PromptEHR: Conditional Electronic Healthcare Records Generation with Prompt Learning", "authors": "Zifeng Wang and Jimeng Sun", "abstract": "Accessing longitudinal multimodal Electronic Healthcare Records (EHRs) is challenging due to privacy concerns, which hinders the use of ML for healthcare applications. Synthetic EHRs generation bypasses the need to share sensitive real patient records. However, existing methods generate single-modal EHRs by unconditional generation or by longitudinal inference, which falls short of low flexibility and makes unrealistic EHRs. In this work, we propose to formulate EHRs generation as a text-to-text translation task by language models (LMs), which suffices to highly flexible event imputation during generation. We also design prompt learning to control the generation conditioned by numerical and categorical demographic features. We evaluate synthetic EHRs quality by two perplexity measures accounting for their longitudinal pattern (longitudinal imputation perplexity, lpl) and the connections cross modalities (cross-modality imputation perplexity, mpl). Moreover, we utilize two adversaries: membership and attribute inference attacks for privacy-preserving evaluation. Experiments on MIMIC-III data demonstrate the superiority of our methods on realistic EHRs generation (53.1\\% decrease of lpl and 45.3\\% decrease of mpl on average compared to the best baselines) with low privacy risks. Software is available at https://github.com/RyanWangZf/PromptEHR.", "track": "NLP Applications", "label": 0}, {"loc": [7.738099575042725, 3.5442287921905518], "id": 184, "title": "ROSE: Robust Selective Fine-tuning for Pre-trained Language Models", "authors": "Lan Jiang, Hao Zhou, Yankai Lin, Peng Li, Jie Zhou and Rui Jiang", "abstract": "Even though the large-scale language models have achieved excellent performances, they suffer from various adversarial attacks.\nA large body of defense methods has been proposed. \nHowever, they are still limited due to redundant attack search spaces and the inability to defend against various types of attacks.\nIn this work, we present a novel fine-tuning approach called \\textbf{RO}bust \\textbf{SE}letive fine-tuning (\\textbf{ROSE}) to address this issue.\nROSE conducts selective updates when adapting pre-trained models to downstream tasks, filtering out invaluable and unrobust updates of parameters.\nSpecifically, we propose two strategies: the first-order and second-order ROSE for selecting target robust parameters.\nThe experimental results show that ROSE achieves significant improvements in adversarial robustness on various downstream NLP tasks, and the ensemble method even surpasses both variants above.\nFurthermore, ROSE can be easily incorporated into existing fine-tuning methods to improve their adversarial robustness further.\nThe empirical analysis confirms that ROSE eliminates unrobust spurious updates during fine-tuning, leading to solutions corresponding to flatter and wider optima than the conventional method.\nCode is available at \\url{https://github.com/jiangllan/ROSE}.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [6.528061389923096, 1.8590787649154663], "id": 191, "title": "CodeRetriever: A Large Scale Contrastive Pre-Training Method for Code Search", "authors": "Xiaonan Li, Yeyun Gong, Yelong Shen, Xipeng Qiu, Hang Zhang, Bolun Yao, Weizhen Qi, Daxin Jiang, Weizhu Chen and Nan Duan", "abstract": "In this paper, we propose the CodeRetriever model, which learns the function-level code semantic representations through large-scale code-text contrastive pre-training. We adopt two contrastive learning schemes in CodeRetriever: unimodal contrastive learning and bimodal contrastive learning. For unimodal contrastive learning, we design an unsupervised learning approach to build semantic-related code pairs based on the documentation and function name. For bimodal contrastive learning, we leverage the documentation and in-line comments of code to build code-text pairs. Both contrastive objectives can fully leverage large-scale code corpus for pre-training. \nExtensive experimental results show that CodeRetriever achieves new state-of-the-art with significant improvement over existing code pre-trained models, on eleven domain/language-specific code search tasks with six programming languages in different code granularity (function-level, snippet-level and statement-level).\nThese results demonstrate the effectiveness and robustness of CodeRetriever.\nThe codes and resources are available at \\url{https://github.com/microsoft/AR2/tree/main/CodeRetriever}.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [7.7762837409973145, 3.47430682182312], "id": 198, "title": "Open-Topic False Information Detection on Social Networks with Contrastive Adversarial Learning", "authors": "Guanghui Ma, Chunming Hu, Ling Ge and Hong Zhang", "abstract": "Current works about false information detection based on conversation graphs on social networks focus primarily on two research streams from the standpoint of topic distribution: in-topic and cross-topic techniques, which assume that the data topic distribution is identical or cross, respectively. This signifies that all test data topics are seen or unseen by the model.\nHowever, these assumptions are too harsh for actual social networks that contain both seen and unseen topics simultaneously, hence restricting their practical application.\nIn light of this, this paper develops a novel open-topic scenario that is better suited to actual social networks. In this open-topic scenario, we empirically find that the existing models suffer from impairment in the detection performance for seen or unseen topic data, resulting in poor overall model performance. To address this issue, we propose a novel Contrastive Adversarial Learning Network, CALN, that employs an unsupervised topic clustering method to capture topic-specific features to enhance the model's performance for seen topics and an unsupervised adversarial learning method to align data representation distributions to enhance the model's generalisation to unseen topics.\nExperiments on two benchmark datasets and a variety of graph neural networks demonstrate the effectiveness of our approach.", "track": "NLP Applications", "label": 0}, {"loc": [6.4374680519104, 12.122154235839844], "id": 207, "title": "Mitigating Inconsistencies in Multimodal Sentiment Analysis under Uncertain Missing Modalities", "authors": "Jiandian Zeng, Jiantao Zhou and Tianyi Liu", "abstract": "For the missing modality problem in Multimodal Sentiment Analysis (MSA), the inconsistency phenomenon occurs when the sentiment changes due to the absence of a modality. The absent modality that determines the overall semantic can be considered as a key missing modality. However, previous works all ignored the inconsistency phenomenon, simply discarding missing modalities or solely generating associated features from available modalities. The neglect of the key missing modality case may lead to incorrect semantic results. To tackle the issue, we propose an Ensemble-based Missing Modality Reconstruction (EMMR) network to detect and recover semantic features of the key missing modality. Specifically, we first learn joint representations with remaining modalities via a backbone encoder-decoder network. Then, based on the recovered features, we check the semantic consistency to determine whether the absent modality is crucial to the overall sentiment polarity. Once the inconsistency problem due to the key missing modality exists, we integrate several encoder-decoder approaches for better decision making. Extensive experiments and analyses are conducted on CMU-MOSI and IEMOCAP datasets, validating the superiority of the proposed method.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [2.3089511394500732, 4.576399803161621], "id": 209, "title": "ConvTrans: Transforming Web Search Sessions for Conversational Dense Retrieval", "authors": "Kelong Mao, Zhicheng Dou, Hongjin Qian, Fengran Mo, Xiaohua Cheng and Zhao Cao", "abstract": "Conversational search provides users with a natural and convenient new search experience. Recently, conversational dense retrieval has shown to be a promising technique for realizing conversational search. However, as conversational search systems have not been widely deployed, it is hard to get large-scale real conversational search sessions and relevance labels to support the training of conversational dense retrieval. To tackle this data scarcity problem, previous methods focus on developing better few-shot learning approaches or generating pseudo relevance labels, but the data they use for training still heavily rely on manual generation.\n\nIn this paper, we present ConvTrans, a data augmentation method that can automatically transform easily-accessible web search sessions into conversational search sessions to fundamentally alleviate the data scarcity problem for conversational dense retrieval. ConvTrans eliminates the gaps between these two types of sessions in terms of session quality and query form to achieve effective session transformation. Extensive evaluations on two widely used conversational search benchmarks, i.e., CAsT-19 and CAsT-20, demonstrate that the same model trained on the data generated by ConvTrans can achieve comparable retrieval performance as it trained on high-quality but expensive artificial conversational search data.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [2.131030559539795, 7.54892110824585], "id": 229, "title": "MUSIED: A Benchmark for Event Detection from Multi-Source Heterogeneous Informal Texts", "authors": "Xi Xiangyu, Jianwei Lv, Shuaipeng Liu, Wei Ye, Fan Yang and Guanglu Wan", "abstract": "Event detection (ED) identifies and classifies event triggers from unstructured texts, serving as a fundamental task for information extraction. Despite the remarkable progress achieved in the past several years, most research efforts focus on detecting events from formal texts (e.g., news articles, Wikipedia documents, financial announcements). Moreover, the texts in each dataset are either from a single source or multiple yet relatively homogeneous sources. With massive amounts of user-generated text accumulating on the Web and inside enterprises, identifying meaningful events in these informal texts, usually from multiple heterogeneous sources, has become a problem of significant practical value. As a pioneering exploration that expands event detection to the scenarios involving informal and heterogeneous texts, we propose a new large-scale Chinese event detection dataset based on user reviews, text conversations, and phone conversations in a leading e-commerce platform for food service. We carefully investigate the proposed dataset's textual informality and multi-domain heterogeneity characteristics by inspecting data samples quantitatively and qualitatively. Extensive experiments with state-of-the-art event detection methods verify the unique challenges posed by these characteristics, indicating that multi-domain informal event detection remains an open problem and requires further efforts. Our benchmark and code are released at https://github.com/myeclipse/MUSIED.", "track": "Resources and Evaluation", "label": 1}, {"loc": [11.025067329406738, 6.808716773986816], "id": 230, "title": "Reproducibility Issues for BERT-based Evaluation Metrics", "authors": "Yanran Chen, Jonas Belouadi and Steffen Eger", "abstract": "Reproducibility is of utmost concern in machine learning and natural language processing (NLP). In the field of natural language generation (especially machine translation), the seminal paper of Post (2018) has pointed out problems of reproducibility of the dominant metric, BLEU, at the time of publication. Nowadays, BERT-based evaluation metrics considerably outperform BLEU. In this paper, we ask whether results and claims from four recent BERT-based metrics can be reproduced. We find that reproduction of claims and results often fails because of (i) heavy undocumented preprocessing involved in the metrics, (ii) missing code and (iii) reporting weaker results for the baseline metrics. (iv) In one case, the problem stems from correlating not to human scores but to a wrong column in the csv file, inflating scores by 5 points. Motivated by the impact of preprocessing, we then conduct a second study where we examine its effects more closely (for one of the metrics). We find that preprocessing can have large effects, especially for highly inflectional languages. In this case, the effect of preprocessing may be larger than the effect of the aggregation mechanism (e.g., greedy alignment vs. Word Mover Distance).", "track": "Resources and Evaluation", "label": 1}, {"loc": [4.357364177703857, 5.941081523895264], "id": 239, "title": "Improving Multi-task Stance Detection with Multi-task Interaction Network", "authors": "Heyan Chai, Siyu Tang, Jinhao Cui, Ye Ding, Binxing Fang and Qing Liao", "abstract": "Stance detection aims to identify people's standpoints expressed in the text towards a target, which can provide powerful information for various downstream tasks.\nRecent studies have proposed multi-task learning models that introduce sentiment information to boost stance detection.\nHowever, they neglect to explore capturing the fine-grained task-specific interaction between stance detection and sentiment tasks, thus degrading performance.\nTo address this issue, this paper proposes a novel multi-task interaction network (MTIN) for improving the performance of stance detection and sentiment analysis tasks simultaneously.\nSpecifically, we construct heterogeneous task-related graphs to automatically identify and adapt the roles that a word plays with respect to a specific task. Also, a multi-task interaction module is designed to capture the word-level interaction between tasks, so as to obtain richer task representations.\nExtensive experiments on two real-world datasets show that our proposed approach outperforms state-of-the-art methods in both stance detection and sentiment analysis tasks.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [0.5278493165969849, 7.039675712585449], "id": 240, "title": "Neural-based Mixture Probabilistic Query Embedding for Answering FOL queries on Knowledge Graphs", "authors": "xiao long, Liansheng Zhuang, Li Aodi, Shafei Wang and Houqiang Li", "abstract": "Query embedding (QE)\u2014which aims to embed entities and first-order logical (FOL) queries in a vector space, has shown great power in answering FOL queries on knowledge graphs (KGs). Existing QE methods divide a complex query into a sequence of mini-queries according to its computation graph and perform logical operations on the answer sets of mini-queries to get answers. However, most of them assume that answer sets satisfy an individual distribution (e.g., Uniform, Beta, or Gaussian), which is often violated in real applications and limit their performance. In this paper, we propose a Neural-based Mixture Probabilistic Query Embedding Model (NMP-QEM) that encodes the answer set of each mini-query as a mixed Gaussian distribution with multiple means and covariance parameters, which can approximate any random distribution arbitrarily well in real KGs. Additionally, to overcome the difficulty in defining the closed solution of negation operation, we introduce neural-based logical operators of projection, intersection and negation for a mixed Gaussian distribution to answer all the FOL queries. Extensive experiments demonstrate that NMP-QEM significantly outperforms existing state-of-the-art methods on benchmark datasets. In NELL995, NMP-QEM achieves a 31\\% relative improvement over the state-of-the-art.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [4.671926975250244, 6.999411106109619], "id": 248, "title": "Improving Multi-turn Emotional Support Dialogue Generation with Lookahead Strategy Planning", "authors": "Yi Cheng, Wenge Liu, Wenjie Li, Jiashuo WANG, Ruihui Zhao, Bang Liu, Xiaodan Liang and Yefeng Zheng", "abstract": "Providing Emotional Support (ES) to soothe people in emotional distress is an essential capability in social interactions. Most existing researches on building ES conversation systems only considered single-turn interactions with users, which was over-simplified. In comparison, multi-turn ES conversation systems can provide ES more effectively, but face several new technical challenges, including: (1) how to adopt appropriate support strategies to achieve the long-term dialogue goal of comforting the user's emotion; (2) how to dynamically model the user's state. In this paper, we propose a novel system MultiESC to address these issues. For strategy planning, drawing inspiration from the A* search algorithm, we propose lookahead heuristics to estimate the future user feedback after using particular strategies, which helps to select strategies that can lead to the best long-term effects. For user state modeling, MultiESC focuses on capturing users' subtle emotional expressions and understanding their emotion causes. Extensive experiments show that MultiESC significantly outperforms competitive baselines in both dialogue generation and strategy planning.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [7.9762749671936035, 9.743143081665039], "id": 254, "title": "Conformal Predictor for Improving Zero-Shot Text Classification Efficiency", "authors": "Prafulla Kumar Choubey, Yu Bai, Chien-Sheng Wu, Wenhao Liu and Nazneen Rajani", "abstract": "Pre-trained language models (PLMs) have been shown effective for zero-shot (0shot) text classification. 0shot models based on natural language inference (NLI) and next sentence prediction (NSP) employ cross-encoder architecture and infer by making a forward pass through the model for each label-text pair separately. This increases the computational cost to make inferences linearly in the number of labels. In this work, we improve the efficiency of such cross-encoder-based 0shot models by restricting the number of likely labels using another fast base classifier-based conformal predictor (CP) calibrated on samples labeled by the 0shot model. Since a CP generates prediction sets with coverage guarantees, it reduces the number of target labels without excluding the most probable label based on the 0shot model. We experiment with three intent and two topic classification datasets. With a suitable CP for each dataset, we reduce the average inference time for NLI- and NSP-based models by 25.6% and 22.2% respectively, without dropping performance below the predefined error rate of 1%.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [1.8328033685684204, 3.931988000869751], "id": 259, "title": "Effective and Efficient Query-aware Snippet Extraction for Web Search", "authors": "Jingwei Yi, Fangzhao Wu, Chuhan Wu, Xiaolong Huang, Binxing Jiao, Guangzhong Sun and Xing Xie", "abstract": "Query-aware webpage snippet extraction is widely used in search engines to help users better understand the content of the returned webpages before clicking. The extracted snippet is expected to summarize the webpage in the context of the input query. Existing snippet extraction methods mainly rely on handcrafted features of overlapping words, which cannot capture deep semantic relationships between the query and webpages. Another idea is to extract the sentences which are most relevant to queries as snippets with existing text matching methods. However, these methods ignore the contextual information of webpages, which may be sub-optimal. In this paper, we propose an effective query-aware webpage snippet extraction method named DeepQSE. In DeepQSE, the concatenation of title, query and each candidate sentence serves as an input of query-aware sentence encoder, aiming to capture the fine-grained relevance between the query and sentences. Then, these query-aware sentence representations are modeled jointly through a document-aware relevance encoder to capture contextual information of the webpage. Since the query and each sentence are jointly modeled in DeepQSE, its online inference may be slow. Thus, we further propose an efficient version of DeepQSE, named Efficient-DeepQSE, which can significantly improve the inference speed of DeepQSE without affecting its performance. The core idea of Efficient-DeepQSE is to decompose the query-aware snippet extraction task into two stages, i.e., a coarse-grained candidate sentence selection stage where sentence representations can be cached, and a fine-grained relevance modeling stage. Experiments on two datasets validate the effectiveness and efficiency of our methods.", "track": "Summarization", "label": 14}, {"loc": [2.548063278198242, 4.718426704406738], "id": 281, "title": "You Only Need One Model for Open-domain Question Answering", "authors": "Haejun Lee, Akhil Kedia, Jongwon Lee, Ashwin Paranjape, Christopher Manning and Kyoung-Gu Woo", "abstract": "Recent approaches to Open-domain Question Answering refer to an external knowledge base using a retriever model, optionally rerank passages with a separate reranker model and generate an answer using another reader model. Despite performing related tasks, the models have separate parameters and are weakly-coupled during training. We propose casting the retriever and the reranker as internal passage-wise attention mechanisms applied sequentially within the transformer architecture and feeding computed representations to the reader, with the hidden representations progressively refined at each stage. This allows us to use a single question answering model trained end-to-end, which is a more efficient use of model capacity and also leads to better gradient flow. We present a pre-training method to effectively train this architecture and evaluate our model on the Natural Questions and TriviaQA open datasets. For a fixed parameter budget, our model outperforms the previous state-of-the-art model by 1.0 and 0.7 exact match scores.", "track": "Question Answering", "label": 11}, {"loc": [1.6088727712631226, 8.614110946655273], "id": 287, "title": "Generative Entity Typing with Curriculum Learning", "authors": "Siyu Yuan, Deqing Yang, Jiaqing Liang, Zhixu Li, Jinxi Liu, Jingyue Huang and Yanghua Xiao", "abstract": "Entity typing aims to assign types to the entity mentions in given texts. The traditional classification-based entity typing paradigm has two unignorable drawbacks: 1) it fails to assign an entity to the types beyond the predefined type set, and 2) it can hardly handle few-shot and zero-shot situations where many long-tail types only have few or even no training instances. To overcome these drawbacks, we propose a novel generative entity typing (GET) paradigm: given a text with an entity mention, the multiple types for the role that the entity plays in the text are generated with a pre-trained language model (PLM). However, PLMs tend to generate coarse-grained types after fine-tuning upon the entity typing dataset. In addition, only the heterogeneous training data consisting of a small portion of human-annotated data and a large portion of auto-generated but low-quality data are provided for model training. To tackle these problems, we employ curriculum learning (CL) to train our GET model on heterogeneous data, where the curriculum could be self-adjusted with the self-paced learning according to its comprehension of the type granularity and data heterogeneity. Our extensive experiments upon the datasets of different languages and downstream tasks justify the superiority of our GET model over the state-of-the-art entity typing models. The code has been released on https://github.com/siyuyuan/GET.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [1.7253130674362183, 8.752220153808594], "id": 297, "title": "SetGNER: General Named Entity Recognition as Entity Set Generation", "authors": "Yuxin He and Buzhou Tang", "abstract": "Recently, joint recognition of flat, nested and discontinuous entities has received increasing attention. Motivated by the observation that the target output of NER is essentially a set of sequences, we propose a novel entity set generation framework for general NER scenes in this paper. Different from sequence-to-sequence NER methods, our method does not force the entities to be generated in a predefined order and can get rid of the problem of error propagation and inefficient decoding. Distinguished from the set-prediction NER framework, our method treats each entity as a sequence and is capable of recognizing discontinuous mentions. Given an input sentence, the model first encodes the sentence in word-level and detects potential entity mentions based on the encoder's output, then reconstructs entity mentions from the detected entity heads in parallel. To let the encoder of our model capture better right-to-left semantic structure, we also propose an auxiliary Inverse Generation Training task. Extensive experiments show that our model (w/o. Inverse Generation Training) outperforms state-of-the-art generative NER models by a large margin on two discontinuous NER datasets, two nested NER datasets and one flat NER dataset. Besides, the auxiliary Inverse Generation Training task is found to further improve the model's performance on the five datasets.", "track": "Information Extraction", "label": 5}, {"loc": [3.684114933013916, 9.895216941833496], "id": 306, "title": "Opinion Summarization by Weak-Supervision from Mix-structured Data", "authors": "Yizhu Liu, Qi Jia and Kenny Zhu", "abstract": "Opinion summarization of multiple reviews suffers from the lack \nof reference summaries for training.\nMost previous approaches construct multiple reviews and their summary \nbased on textual similarities between reviews,\nresulting in information mismatch between the review input and the summary. \nIn this paper, we convert each review into a mix\nof structured and unstructured data, \nwhich we call opinion-aspect pairs (OAs) and implicit sentences (ISs).\nWe propose a new method to synthesize training pairs \nof such mix-structured data as input and the textual summary as output,\nand design a summarization model with OA encoder and IS encoder.\nExperiments show that our approach outperforms previous \nmethods on Yelp, Amazon and RottenTomatos datasets.", "track": "Summarization", "label": 14}, {"loc": [8.997915267944336, 6.5478835105896], "id": 320, "title": "Multi-level Distillation of Semantic Knowledge for Pre-training Multilingual Language Model", "authors": "Mingqi Li, Fei Ding, Dan Zhang, Long Cheng, Hongxin Hu and feng luo", "abstract": "Pre-trained multilingual language models play an important role in cross-lingual natural language understanding tasks. However, existing methods did not focus on learning the semantic structure of representation, and thus could not optimize their performance. In this paper, we propose Multi-level Multilingual Knowledge Distillation (MMKD), a novel method for improving multilingual language models. Specifically, we employ a teacher-student framework to adopt rich semantic representation knowledge in English BERT. We propose token-, word-, sentence-, and structure-level alignment objectives to encourage multiple levels of consistency between source-target pairs and correlation similarity between teacher and student models. We conduct experiments on cross-lingual evaluation benchmarks including XNLI, PAWS-X, and XQuAD. Experimental results show that MMKD outperforms other baseline models of similar size on XNLI and XQuAD and obtains comparable performance on PAWS-X. Especially, MMKD obtains significant performance gains on low-resource languages.", "track": "Multilinguality", "label": 13}, {"loc": [2.0102570056915283, 4.070786952972412], "id": 324, "title": "Empowering Dual-Encoder with Query Generator for Cross-Lingual Dense Retrieval", "authors": "Houxing Ren, Linjun Shou, Ning Wu, Ming Gong and Daxin Jiang", "abstract": "In monolingual dense retrieval, lots of works focus on how to distill knowledge from cross-encoder re-ranker to dual-encoder retriever and these methods achieve better performance due to the effectiveness of cross-encoder re-ranker. However, we find that the performance of the cross-encoder re-ranker is heavily influenced by the number of training samples and the quality of negative samples, which is hard to obtain in the cross-lingual setting. In this paper, we propose to use a query generator as the teacher in the cross-lingual setting, which is less dependent on enough training samples and high-quality negative samples. In addition to traditional knowledge distillation, we further propose a novel enhancement method, which uses the query generator to help the dual-encoder align queries from different languages, but does not need any additional parallel sentences. The experimental results show that our method outperforms the state-of-the-art methods on two benchmark datasets.", "track": "Multilinguality", "label": 13}, {"loc": [4.840381145477295, 4.609951019287109], "id": 340, "title": "R2F: A General Retrieval, Reading and Fusion Framework for Document-level Natural Language Inference", "authors": "Hao Wang, Yixin Cao, Yangguang Li, Zhen Huang, Kun Wang and Jing Shao", "abstract": "Document-level natural language inference (DOCNLI) is a new challenging task in natural language processing, aiming at judging the entailment relationship between a pair of hypothesis and premise documents. Current datasets and baselines largely follow sentence-level settings, but fail to address the issues raised by longer documents. In this paper, we establish a general solution, named Retrieval, Reading and Fusion (R2F) framework, and a new setting, by analyzing the main challenges of DOCNLI: interpretability, long-range dependency, and cross-sentence inference. The basic idea of the framework is to simplify document-level task into a set of sentence-level tasks, and improve both performance and interpretability with the power of evidence. For each hypothesis sentence, the framework retrieves evidence sentences from the premise, and reads to estimate its credibility. Then the sentence-level results are fused to judge the relationship between the documents. For the setting, we contribute complementary evidence and entailment label annotation on hypothesis sentences, for interpretability study. Our experimental results show that R2F framework can obtain state-of-the-art performance and is robust for diverse evidence retrieval methods. Moreover, it can give more interpretable prediction results. Our model and code are released at https://github.com/phoenixsecularbird/R2F.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [9.0950927734375, 6.504434585571289], "id": 351, "title": "Revisiting Pre-trained Language Models and their Evaluation for Arabic Natural Language Processing", "authors": "Abbas Ghaddar, Yimeng Wu, Sunyam Bagga, Ahmad Rashid, Khalil Bibi, Mehdi Rezagholizadeh, Chao Xing, Yasheng Wang, Xinyu Duan, Zhefeng Wang, baoxing Huai, Xin Jiang, Qun Liu and Phillippe Langlais", "abstract": "There is a growing body of work in recent years to develop pre-trained language models (PLMs) for the Arabic language. This work addresses two major problems in existing Arabic PLMs that limit the progress of the Arabic NLU and NLG fields. First, existing Arabic PLMs are not well-explored and their pre-training can be improved significantly using a more methodical approach. Second, there is a lack of systematic and reproducible evaluation of these models in the literature. We revisit both the pre-training and evaluation of Arabic PLMs. In terms of pre-training, we explore the impact of the quality of the pretraining data, the size of the model, and the incorporation of character-level information on Arabic PLM. As a result, we release three new Arabic BERT-style models ( JABER, Char-JABER, and SABER), and two T5-style models (AT5S and AT5B). In terms of evaluation, we conduct a comprehensive empirical study to systematically evaluate the performance of existing state-of-the-art models on ALUE, a leaderboard-powered benchmark for Arabic NLU tasks, and on a subset of the Arabic generative tasks. We show that our models significantly outperform existing Arabic PLMs and achieve a new state-of-the-art performance on discriminative and generative Arabic NLU and NLG tasks. Our models and source code to reproduce results will be made available upon acceptance.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [2.739131212234497, 4.600147247314453], "id": 357, "title": "KECP: Knowledge Enhanced Contrastive Prompting for Few-shot Extractive Question Answering", "authors": "Jianing Wang, Chengyu Wang, Minghui Qiu, Qiuhui Shi, Hongbin Wang, jun huang and Ming Gao", "abstract": "Extractive Question Answering (EQA) is one of the most essential tasks in Machine Reading Comprehension (MRC), which can be solved by fine-tuning the span selecting heads of Pre-trained Language Models (PLMs). However, most existing approaches for MRC may perform poorly in the few-shot learning scenario. To solve this issue, we propose a novel framework named Knowledge Enhanced Contrastive Prompt-tuning (KECP). Instead of adding pointer heads to PLMs, we introduce a seminal paradigm for EQA that transforms the task into a non-autoregressive Masked Language Modeling (MLM) generation problem. Simultaneously, rich semantics from the external knowledge base (KB) and the passage context support enhancing the query's representations. In addition, to boost the performance of PLMs, we jointly train the model by the MLM and contrastive learning objectives. Experiments on multiple benchmarks demonstrate that our method consistently outperforms state-of-the-art approaches in few-shot settings by a large margin.", "track": "Question Answering", "label": 11}, {"loc": [7.193092346191406, 7.633299350738525], "id": 358, "title": "Knowledge Prompting in Pre-trained Language Model for Natural Language Understanding", "authors": "Jianing Wang, Wenkang Huang, Minghui Qiu, Qiuhui Shi, Hongbin Wang, Xiang Li and Ming Gao", "abstract": "Knowledge-enhanced Pre-trained Language Model (PLM) has recently received significant attention, which aims to incorporate factual knowledge into PLMs. However, most existing methods modify the internal structures of fixed types of PLMs by stacking complicated modules, and introduce redundant and irrelevant factual knowledge from knowledge bases (KBs). In this paper, to address these problems, we introduce a seminal knowledge prompting paradigm and further propose a knowledge-prompting-based PLM framework KP-PLM. This framework can be flexibly combined with existing mainstream PLMs. Specifically, we first construct a knowledge sub-graph from KBs for each context. Then we design multiple continuous prompts rules and transform the knowledge sub-graph into natural language prompts. To further leverage the factual knowledge from these prompts, we propose two novel knowledge-aware self-supervised tasks including prompt relevance inspection and masked prompt modeling. Extensive experiments on multiple natural language understanding (NLU) tasks show the superiority of KP-PLM over other state-of-the-art methods in both full-resource and low-resource settings. Our source codes will be released upon the acceptance of the paper.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [8.116920471191406, 5.195946216583252], "id": 364, "title": "On the Evaluation Metrics for Paraphrase Generation", "authors": "Lingfeng Shen, Lemao Liu, Haiyun Jiang and Shuming Shi", "abstract": "In this paper we revisit automatic metrics for paraphrase evaluation and obtain two findings that disobey conventional wisdom: (1) Reference-free metrics achieve better performance than their reference-based counterparts. (2) Most commonly used metrics do not align well with human annotation.\nUnderlying reasons behind the above findings are explored through additional experiments and in-depth analyses.\nBased on the experiments and analyses, we propose ParaScore, a new evaluation metric for paraphrase generation. It possesses the merits of reference-based and reference-free metrics and explicitly models lexical divergence. Based on our analysis and improvements, our proposed reference-based outperforms than reference-free metrics.\nExperimental results demonstrate that ParaScore significantly outperforms existing metrics.", "track": "Resources and Evaluation", "label": 1}, {"loc": [6.389890670776367, 12.151315689086914], "id": 369, "title": "Curriculum Learning Meets Weakly Supervised Multimodal Correlation Learning", "authors": "Sijie Mai, Ya Sun and Haifeng Hu", "abstract": "In the field of multimodal sentiment analysis (MSA), a few studies have leveraged the inherent modality correlation information stored in samples for self-supervised learning. However, they feed the training pairs in a random order without consideration of difficulty. Without human annotation, the generated training pairs of self-supervised learning often contain noise. If noisy or hard pairs are used for training at the easy stage, the model might be stuck in bad local optimum. In this paper, we inject curriculum learning into weakly supervised multimodal correlation learning. The weakly supervised correlation learning leverages the label information to generate scores for negative pairs to learn a more discriminative embedding space, where negative pairs are defined as two unimodal embeddings from different samples. To assist the correlation learning, we feed the training pairs to the model according to difficulty by the proposed curriculum learning, which consists of elaborately designed scoring and feeding functions. The scoring function computes the difficulty of pairs using pre-trained and current correlation predictors, where the pairs with large losses are defined as hard pairs. Notably, the hardest pairs are discarded in our algorithm, which are assumed as noisy pairs. Moreover, the feeding function takes the difference of correlation losses as feedback to determine the feeding actions (`stay', `step back', or `step forward'). The proposed method reaches state-of-the-art performance on MSA.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [6.497754096984863, 1.8809715509414673], "id": 380, "title": "Rethinking Positional Encoding in Tree Transformer for Code Representation", "authors": "Han Peng, Ge Li, Yunfei Zhao and Zhi Jin", "abstract": "Transformers are now widely used in code representation, and several recent works further develop tree Transformers to capture the syntactic structure in source code. \nSpecifically, novel tree positional encodings have been proposed to incorporate inductive bias into Transformer.\nIn this work, we propose a novel tree Transformer encoding node positions based on our new description method for tree structures.\nTechnically, local and global soft bias shown in previous works is both introduced as positional encodings of our Transformer model.\nOur model finally outperforms strong baselines on code summarization and completion tasks across two languages, demonstrating our model's effectiveness.\nBesides, extensive experiments and ablation study shows that combining both local and global paradigms is still helpful in improving model performance. \nWe release our code at \\url{https://github.com/AwdHanPeng/TreeTransformer}.", "track": "NLP Applications", "label": 0}, {"loc": [1.3441051244735718, 4.924398422241211], "id": 381, "title": "RASAT: Integrating Relational Structures into Pretrained Seq2Seq Model for Text-to-SQL", "authors": "Jiexing Qi, Jingyao Tang, Ziwei He, Xiangpeng Wan, Yu Cheng, Chenghu Zhou, Xinbing Wang, Quanshi Zhang and Zhouhan Lin", "abstract": "Relational structures such as schema linking and schema encoding have been validated as a key component to qualitatively translating natural language into SQL queries. However, introducing these structural relations comes with prices: they often result in a specialized model structure, which largely prohibits using large pretrained models in text-to-SQL. To address this problem, we propose RASAT: a Transformer seq2seq architecture augmented with relation-aware self-attention that could leverage a variety of relational structures while inheriting the pretrained parameters from the T5 model effectively. Our model can incorporate almost all types of existing relations in the literature, and in addition, we propose introducing co-reference relations for the multi-turn scenario. Experimental results on three widely used text-to-SQL datasets, covering both single-turn and multi-turn scenarios, have shown that RASAT could achieve competitive results in all three benchmarks, achieving state-of-the-art execution accuracy (75.5% EX on Spider, 52.6% IEX on SParC, and 37.4% IEX on CoSQL).", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [1.0452271699905396, 10.523868560791016], "id": 395, "title": "COM-MRC: A COntext-Masked Machine Reading Comprehension Framework for Aspect Sentiment Triplet Extraction", "authors": "Zepeng Zhai, Hao Chen, Fangxiang Feng, Ruifan Li and Xiaojie WANG", "abstract": "Aspect Sentiment Triplet Extraction (ASTE) aims to extract sentiment triplets from sentences, which was recently formalized as an effective machine reading comprehension (MRC) based framework. However, when facing multiple aspect terms, the MRC-based methods could fail due to the interference from other aspect terms. In this paper, we propose a novel \\textit{COntext-Masked MRC} (COM-MRC) framework for ASTE. Our COM-MRC framework comprises three closely-related components: a context augmentation strategy, a discriminative model, and an inference method. Specifically, a context augmentation strategy is designed by enumerating all masked contexts for each aspect term. The discriminative model comprises four modules, i.e., aspect and opinion extraction modules, sentiment classification and aspect detection modules. In addition, a two-stage inference method first extracts all aspects and then identifies their opinions and sentiment through iteratively masking the aspects. Extensive experimental results on benchmark datasets show the effectiveness of our proposed COM-MRC framework, which outperforms state-of-the-art methods consistently.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [4.303220272064209, 7.505843162536621], "id": 405, "title": "CEM: Machine-Human Chatting Handoff via Causal-Enhance Module", "authors": "Shanshan Zhong, Jinghui Qin, Zhongzhan Huang and Daifeng Li", "abstract": "Aiming to ensure chatbot quality by predicting chatbot failure and enabling human-agent collaboration, Machine-Human Chatting Handoff (MHCH) has attracted lots of attention from both industry and academia in recent years. \nHowever, most existing methods mainly focus on the dialogue context or assist with global satisfaction prediction based on multi-task learning, which ignore the grounded relationships among the causal variables, like the user state and labor cost. These variables are significantly associated with handoff decisions, resulting in prediction bias and cost increasement. \nTherefore, we propose Causal-Enhance Module (CEM) by establishing the causal graph of MHCH based on these two variables, which is a simple yet effective module and can be easy to plug into the existing MHCH methods. \nFor the impact of users, we use the user state to correct the prediction bias according to the causal relationship of multi-task. For the labor cost, we train an auxiliary cost simulator to calculate unbiased labor cost through counterfactual learning so that a model becomes cost-aware.\nExtensive experiments conducted on four real-world benchmarks demonstrate the effectiveness of CEM in generally improving the performance of existing MHCH methods without any elaborated model crafting.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [8.039241790771484, 9.069884300231934], "id": 408, "title": "Nearest Neighbor Zero-Shot Inference", "authors": "Weijia Shi, Julian Michael, Suchin Gururangan and Luke Zettlemoyer", "abstract": "Retrieval-augmented language models (LMs) use non-parametric memory to substantially outperform their non-retrieval counterparts on perplexity-based evaluations, but it is an open question whether they achieve similar gains in few- and zero-shot end-task accuracy. We extensively study one such model, the k-nearest neighbor LM (kNN-LM), showing that the gains marginally transfer. The main challenge is to achieve coverage of the verbalizer tokens that define the different end-task class labels. To address this challenge, we also introduce kNN-Prompt, a simple and effective kNN-LM with automatically expanded fuzzy verbalizers (e.g. to expand \"terrible\u201d to also include \"silly\u201d and other task-specific synonyms for sentiment classification). Across nine diverse end-tasks, using kNN-Prompt with GPT-2 large yields significant performance boosts over strong zeroshot baselines (13.4% absolute improvement over the base LM on average). We also show that other advantages of non-parametric augmentation hold for end tasks; kNN-Prompt is effective for domain adaptation with no further training, and gains increase with the size of the retrieval model.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [4.4921956062316895, 7.134725093841553], "id": 415, "title": "Robots-Dont-Cry: Understanding Falsely Anthropomorphic Utterances in Dialog Systems", "authors": "David Gros, Yu Li and Zhou Yu", "abstract": "Dialog systems are often designed or trained to output human-like responses. However, some responses may be impossible for a machine to truthfully say (e.g. \"that movie made me cry\"). Highly anthropomorphic responses might make users uncomfortable or implicitly deceive them into thinking they are interacting with a human. \nWe collect human ratings on the feasibility of approximately 900 two-turn dialogs sampled from 9 diverse data sources. Ratings are for two hypothetical machine embodiments: a futuristic humanoid robot and a digital assistant. We find that for some data-sources commonly used to train dialog systems, 20-30\\% of utterances are not viewed as possible for a machine. Rating is marginally affected by machine embodiment. \nWe explore qualitative and quantitative reasons for these ratings. Finally, we build classifiers and explore how modeling configuration might affect output permissibly, and discuss implications for building less falsely anthropomorphic dialog systems.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [1.1115050315856934, 10.43939208984375], "id": 416, "title": "A Joint Learning Framework for Restaurant Survival Prediction and Explanation", "authors": "Xin Li, Xiaojie Zhang, Peng JiaHao, Rui Mao, Mingyang Zhou, Xing Xie and Hao Liao", "abstract": "The bloom of the Internet and the recent breakthroughs in deep learning techniques open a new door to AI for E-commence, with a trend of evolving from using a few financial factors such as liquidity and profitability to using more advanced AI techniques to process complex and multi-modal data. In this paper, we tackle the practical problem of restaurant survival prediction. We argue that traditional methods ignore two essential respects, which are very helpful for the task: 1) modeling customer reviews and 2) jointly considering status prediction and result explanation. Thus, we propose a novel joint learning framework for explainable restaurant survival prediction based on the multi-modal data of user-restaurant interactions and users' textual reviews. Moreover, we design a graph neural network to capture the high-order interactions and design a co-attention mechanism to capture the most informative and meaningful signal from noisy textual reviews. Our results on two datasets show a significant and consistent improvement over the SOTA techniques (average 6.8% improvement in prediction and 45.3% improvement in explanation).", "track": "NLP Applications", "label": 0}, {"loc": [8.11104965209961, 9.425466537475586], "id": 423, "title": "Making Pretrained Language Models Good Long-tailed Learners", "authors": "Chen Zhang, Lei Ren, Jingang Wang, Wei Wu and Dawei Song", "abstract": "Prompt-tuning has shown appealing performance in few-shot classification by virtue of its capability in effectively exploiting pre-trained knowledge. This motivates us to check the hypothesis that prompt-tuning is also a promising choice for long-tailed classification, since the tail classes are intuitively few-shot ones. To achieve this aim, we conduct empirical studies to examine the hypothesis. The results demonstrate that prompt-tuning makes pretrained language models at least good long-tailed learners. For intuitions on why prompt-tuning can achieve good performance in long-tailed classification, we carry out in-depth analyses by progressively bridging the gap between prompt-tuning and commonly used finetuning. The summary is that the classifier structure and parameterization form the key to making good long-tailed learners, in comparison with the less important input structure. Finally, we verify the applicability of our finding to few-shot classification.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.012414455413818, 3.992795467376709], "id": 425, "title": "UniGeo: Unifying Geometry Logical Reasoning via Reformulating Mathematical Expression", "authors": "Jiaqi Chen, Tong Li, Jinghui Qin, Pan Lu, Liang Lin, Chongyu Chen and Xiaodan Liang", "abstract": "Geometry problem solving is a well-recognized testbed for evaluating the high-level multi-modal reasoning capability of deep models. In most existing works, two main geometry problems: calculation and proving, are usually treated as two specific tasks, hindering a deep model to unify its reasoning capability on multiple math tasks. However, in essence, these two tasks have similar problem representations and overlapped math knowledge which can improve the understanding and reasoning ability of a deep model on both two tasks. Therefore, we construct a large-scale Unified Geometry problem benchmark, UniGeo, which contains 4,998 calculation problems and 9,543 proving problems. Each proving problem is annotated with a multi-step proof with reasons and mathematical expressions. The proof can be easily reformulated as a proving sequence that shares the same formats with the annotated program sequence for calculation problems. Naturally, we also present a unified multi-task Geometric Transformer framework, Geoformer, to tackle calculation and proving problems simultaneously in the form of sequence generation, which finally shows the reasoning ability can be improved on both two tasks by unifying formulation. Furthermore, we propose a Mathematical Expression Pretraining (MEP) method that aims to predict the mathematical expressions in the problem solution, thus improving the Geoformer model. Experiments on the UniGeo demonstrate that our proposed Geoformer obtains state-of-the-art performance by outperforming task-specific model NGS with over 5.6\\% and 3.2\\% accuracies on calculation and proving problems, respectively.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [6.404867172241211, 12.175616264343262], "id": 430, "title": "Face-Sensitive Image-to-Emotional-Text Cross-modal Translation for Multimodal Aspect-based Sentiment Analysis", "authors": "Hao Yang, Yanyan Zhao and Bing Qin", "abstract": "Aspect-level multimodal sentiment analysis, which aims to identify the sentiment of the target aspect from multimodal data, recently has attracted extensive attention in the community of multimedia and natural language processing. Despite the recent success in textual aspect-based sentiment analysis, existing models mainly focused on utilizing the object-level semantic information in the image but ignore explicitly using the visual emotional cues, especially the facial emotions. How to distill visual emotional cues and align them with the textual content remains a key challenge to solve the problem. In this work, we introduce a face-sensitive image-to-emotional-text translation (FITE) method, which focuses on capturing visual sentiment cues through facial expressions and selectively matching and fusing with the target aspect in textual modality. To the best of our knowledge, we are the first that explicitly utilize the emotional information from images in the multimodal aspect-based sentiment analysis task. Experiment results show that our method achieves state-of-the-art results on the Twitter-2015 and Twitter-2017 datasets. The improvement demonstrates the superiority of our model in capturing aspect-level sentiment in multimodal data with facial expressions.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [4.347741603851318, 7.282843112945557], "id": 431, "title": "FineD-Eval: Fine-grained Automatic Dialogue-Level Evaluation", "authors": "Chen Zhang, Luis Fernando D'Haro, Qiquan Zhang, Thomas Friedrichs and Haizhou Li", "abstract": "Recent model-based reference-free metrics for open-domain dialogue evaluation exhibit promising correlations with human judgment. However, they either perform turn-level evaluation or look at a single dialogue quality dimension. One would expect a good evaluation metric to assess multiple quality dimensions at the dialogue level. To this end, we are motivated to propose a multi-dimensional dialogue-level metric, which consists of three sub-metrics with each targeting a specific dimension. The sub-metrics are trained with novel self-supervised objectives and exhibit strong correlations with human judgment for their respective dimensions. Moreover, we explore two approaches to combine the sub-metrics: metric ensemble and multitask learning. Both approaches yield a holistic metric that significantly outperforms individual sub-metrics. Compared to the existing state-of-the-art metric, the combined metrics achieve around 16% relative improvement on average across three high-quality dialogue-level evaluation benchmarks.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [7.991034030914307, 5.812930107116699], "id": 438, "title": "Sentence Representation Learning with Generative Objective rather than Contrastive Objective", "authors": "Bohong Wu and Hai Zhao", "abstract": "Though offering amazing contextualized token-level representations, current pre-trained language models take less attention on accurately acquiring sentence-level representation during their self-supervised pre-training. However, contrastive objectives which dominate the current sentence representation learning bring little linguistic interpretability and no performance guarantee on downstream semantic tasks. We instead propose a novel generative self-supervised learning objective based on phrase reconstruction. To overcome the drawbacks of previous generative methods, we carefully model intra-sentence structure by breaking down one sentence into pieces of important phrases. Empirical studies show that our generative learning achieves powerful enough performance improvement and outperforms the current state-of-the-art contrastive methods not only on the STS benchmarks, but also on downstream semantic retrieval and reranking tasks. Our code is available at https://github.com/chengzhipanpan/PaSeR.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [7.665589332580566, 8.905779838562012], "id": 456, "title": "RLPrompt: Optimizing Discrete Text Prompts with Reinforcement Learning", "authors": "Mingkai Deng, Jianyu Wang, Cheng-Ping Hsieh, Yihan Wang, Han Guo, Tianmin Shu, Meng Song, Eric Xing and Zhiting Hu", "abstract": "Prompting has shown impressive success in enabling large pre-trained language models (LMs) to perform diverse NLP tasks, especially with only few downstream data. Automatically finding the optimal prompt for each task, however, is challenging. Most existing work resorts to tuning *soft* prompts (e.g., embeddings) which fall short of interpretability, reusability across LMs, and applicability when gradients are not accessible. *Discrete* prompts, on the other hand, are difficult to optimize, and are often created by \"enumeration (e.g., paraphrasing)-then-selection\" heuristics that do not explore the prompt space systematically. This paper proposes RLPrompt, an efficient discrete prompt optimization approach with reinforcement learning (RL). RLPrompt formulates a parameter-efficient policy network that generates the optimized discrete prompt after training with reward. To harness the complex and stochastic reward signals from the large LM environment, we incorporate effective reward stabilization that substantially enhances training efficiency. RLPrompt is flexibly applicable to different types of LMs, such as masked (e.g., BERT) and left-to-right models (e.g., GPTs), for both classification and generation tasks. Experiments on few-shot classification and unsupervised text style transfer show superior performance over a wide range of existing fine-tuning or prompting methods. Interestingly, the resulting optimized prompts are often ungrammatical gibberish text; and surprisingly, those gibberish prompts are transferrable between different LMs to retain significant performance, indicating that LM prompting may not follow human language patterns.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [5.8519182205200195, 8.786553382873535], "id": 462, "title": "DisCup: Discriminator Cooperative Unlikelihood Prompt-tuning for Controllable Text Generation", "authors": "Hanqing Zhang and Dawei Song", "abstract": "Prompt learning with immensely large Casual Language Models (CLMs) has been shown promising for attribute-controllable text generation (CTG). However, vanilla prompt tuning tends to imitate training corpus characteristics beyond the control attributes, resulting in a poor generalization ability. Moreover, it is less able to capture the relationship between different attributes, further limiting the control performance. In this paper, we propose a new CTG approach, namely DisCup, which incorporates the attribute knowledge of discriminator to optimize the control-prompts, steering a frozen CLM to produce attribute-specific texts. Specifically, the frozen CLM model, capable of producing multitudinous texts, is first used to generate the next-token candidates based on the context, so as to ensure the diversity of tokens to be predicted. Then, we leverage an attribute-discriminator to select desired/undesired tokens from those candidates, providing the inter-attribute knowledge. Finally, we bridge the above two traits by an unlikelihood objective for prompt-tuning. Extensive experimental results show that DisCup can achieve a new state-of-the-art control performance while maintaining an efficient and high-quality text generation, only relying on around 10 virtual tokens.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.476661205291748, 12.111429214477539], "id": 472, "title": "CPL: Counterfactual Prompt Learning for Vision and Language Models", "authors": "Xuehai He, Diji Yang, Weixi Feng, Tsu-Jui Fu, Arjun Akula, Varun Jampani, Pradyumna Narayana, Sugato Basu, William Yang Wang and Xin Eric Wang", "abstract": "Prompt tuning is a new few-shot transfer learning technique that only tunes the learnable prompt for pre-trained vision and language models such as CLIP. However, existing prompt tuning methods tend to learn spurious or entangled representations, which leads to poor generalization to unseen concepts.\nTowards non-spurious and efficient prompt learning from limited examples, this paper presents a novel Counterfactual Prompt Learning (CPL) method for vision and language models, which simultaneously employs counterfactual generation and contrastive learning in a joint optimization framework.\nParticularly, CPL constructs counterfactual by identifying minimal non-spurious feature change between semantically-similar positive and negative samples that causes concept change, and learns more generalizable prompt representation from both factual and counterfactual examples via contrastive learning. Extensive experiments demonstrate that CPL can obtain superior few-shot performance on different vision and language tasks than previous prompt tuning methods on CLIP. On image classification, we achieve 3.55% average relative improvement on unseen classes across seven datasets; on image-text retrieval and visual question answering, we gain up to 4.09% and 25.08% relative improvements across three few-shot scenarios on unseen test sets respectively.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.832625865936279, 7.349550724029541], "id": 475, "title": "Red Teaming Language Models with Language Models", "authors": "Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese and Geoffrey Irving", "abstract": "Language Models (LMs) often cannot be deployed because of their potential to harm users in hard-to-predict ways. Prior work identifies harmful behaviors before deployment by using human annotators to hand-write test cases. However, human annotation is expensive, limiting the number and diversity of test cases. In this work, we automatically find cases where a target LM behaves in a harmful way, by generating test cases (\"red teaming\") using another LM. We evaluate the target LM's replies to generated test questions using a classifier trained to detect offensive content, uncovering tens of thousands of offensive replies in a 280B parameter LM chatbot. We explore several methods, from zero-shot generation to reinforcement learning, for generating test cases with varying levels of diversity and difficulty. Furthermore, we use prompt engineering to control LM-generated test cases to uncover a variety of other harms, automatically finding groups of people that the chatbot discusses in offensive ways, personal and hospital phone numbers generated as the chatbot's own contact info, leakage of private training data in generated text, and harms that occur over the course of a conversation. Overall, LM-based red teaming is one promising tool (among many needed) for finding and fixing diverse, undesirable LM behaviors before impacting users.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [5.067829132080078, 12.552175521850586], "id": 478, "title": "CapOnImage: Context-driven Dense-Captioning on Image", "authors": "Yiqi Gao, Xinglin Hou, Yuanmeng Zhang, Tiezheng Ge, Yuning Jiang and peng wang", "abstract": "Existing image captioning systems are dedicated to generating narrative captions for images, which are spatially detached from the\nimage in presentation. However, texts can also be used as decorations on the image to highlight the key points and increase the\nattractiveness of images. In this work, we introduce a new task\ncalled captioning on image (CapOnImage), which aims to generate\ndense captions at different locations of the image based on contextual information. To fully exploit the surrounding visual context to\ngenerate the most suitable caption for each location, we propose a\nmulti-modal pre-training model with multi-level pre-training tasks\nthat progressively learn the correspondence between texts and image locations from easy to difficult. Since the model may generate\nredundant captions for nearby locations, we further enhance the\nlocation embedding with neighbor locations as context. For this\nnew task, we also introduce a large-scale benchmark called CapOnImage2M, which contains 2.1 million product images, each with an\naverage of 4.8 spatially localized captions. Compared with other image captioning model variants, our model achieves the best results\nin both captioning accuracy and diversity aspects.", "track": "Natural Language Generation", "label": 6}, {"loc": [1.7430247068405151, 8.718894004821777], "id": 516, "title": "SpanProto: A Two-stage Span-based Prototypical Network for Few-shot Named Entity Recognition", "authors": "Jianing Wang, Chengyu Wang, Chuanqi Tan, Minghui Qiu, Songfang Huang, jun huang and Ming Gao", "abstract": "Few-shot Named Entity Recognition (NER) aims to identify named entities with very little annotated data. Previous methods solve this problem based on token-wise classification, which ignores the information of entity boundaries, and inevitably the performance is affected by the massive non-entity tokens. To this end, we propose a seminal span-based prototypical network (SpanProto) that tackles few-shot NER via a two-stage approach, including span extraction and mention classification. \nIn the span extraction stage, we transform the sequential tags into a global boundary matrix, enabling the model to focus on the explicit boundary information. For mention classification, we leverage prototypical learning to capture the semantic representations for each labeled span and make the model better adapt to novel-class entities. To further improve the model performance, we split out the false positives generated by the span extractor but not labeled in the current episode set, and then present a margin-based loss to separate them from each prototype region. Experiments over multiple benchmarks demonstrate that our model outperforms strong baselines by a large margin.", "track": "Information Extraction", "label": 5}, {"loc": [6.6595611572265625, 5.773978233337402], "id": 518, "title": "Discovering Differences in the Representation of People using Contextualized Semantic Axes", "authors": "Li Lucy, Divya Tadimeti and David Bamman", "abstract": "A common paradigm for identifying semantic differences across social and temporal contexts is the use of static word embeddings and their distances. In particular, past work has compared embeddings against \"semantic axes\" that represent two opposing concepts. We extend this paradigm to BERT embeddings, and construct contextualized axes that mitigate the pitfall where antonyms have neighboring representations. We validate and demonstrate these axes on two people-centric datasets: occupations from Wikipedia, and multi-platform discussions in extremist, men's communities over fourteen years. In both studies, contextualized semantic axes can characterize differences among instances of the same word type. In the latter study, we show that references to women and the contexts around them have become more detestable over time.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [4.486772537231445, 5.470057964324951], "id": 527, "title": "Generating Literal and Implied Subquestions to Fact-check Complex Claims", "authors": "Jifan Chen, Aniruddh Sriram, Eunsol Choi and Greg Durrett", "abstract": "Verifying political claims is a challenging task, as politicians can use various tactics to subtly misrepresent the facts for their agenda. Existing automatic fact-checking systems fall short here, and their predictions like \"half-true'' are not very useful in isolation, since it is unclear which parts of a claim are true and which are not. In this work, we focus on decomposing a complex claim into a comprehensive set of yes-no subquestions whose answers influence the veracity of the claim. We present CLAIMDECOMP, a dataset of decompositions for over 1000 claims. Given a claim and its verification paragraph written by fact-checkers, our trained annotators write subquestions covering both explicit propositions of the original claim and its implicit facets, such as asking about additional political context that changes our view of the claim's veracity. We study whether state-of-the-art models can generate such subquestions, showing that these models generate reasonable questions to ask, but predicting the comprehensive set of subquestions from the original claim without evidence remains challenging. We further show that these subquestions can help identify relevant evidence to fact-check the full claim and derive the veracity through their answers, suggesting that they can be useful pieces of a fact-checking pipeline.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [10.97620677947998, 6.7964396476745605], "id": 529, "title": "Machine Translation Robustness to Natural Asemantic Variation", "authors": "Jacob Bremerman, Xiang Ren and Jonathan May", "abstract": "Current Machine Translation (MT) models still struggle with more challenging input, such as noisy data and tail-end words and phrases. Several works have addressed this robustness issue by identifying specific categories of noise and variation then tuning models to perform better on them. An important yet under-studied category involves minor variations in nuance (non-typos) that preserve meaning w.r.t. the target language. We introduce and formalize this category as Natural Asemantic Variation (NAV) and investigate it in the context of MT robustness. We find that existing MT models fail when presented with NAV data, but we demonstrate strategies to improve performance on NAV by fine-tuning them with human-generated variations. We also show that NAV robustness can be transferred across languages and find that synthetic perturbations can achieve some but not all of the benefits of organic NAV data.", "track": "Machine Translation", "label": 10}, {"loc": [6.49038553237915, 1.8903164863586426], "id": 531, "title": "Natural Language to Code Translation with Execution", "authors": "Freda Shi, Daniel Fried, Marjan Ghazvininejad, Luke Zettlemoyer and Sida I. Wang", "abstract": "Generative models of code, pretrained on large corpora of programs, have shown great success in translating natural language to code (Chen et al., 2021; Austin et al., 2021; Li et al., 2022, inter alia). While these models do not explicitly incorporate program semantics (i.e., execution results) during training, they are able to generate correct solutions for many problems. However, choosing a single correct program from a generated set for each problem remains challenging. In this work, we introduce execution result--based minimum Bayes risk decoding (MBR-EXEC) for program selection and show that it improves the few-shot performance of pretrained code models on natural-language-to-code tasks. We select output programs from a generated candidate set by marginalizing over program implementations that share the same semantics. Because exact equivalence is intractable, we execute each program on a small number of test inputs to approximate semantic equivalence. Across datasets, execution or simulated execution significantly outperforms the methods that do not involve program semantics. We find that MBR-EXEC consistently improves over all execution-unaware selection methods, suggesting it as an effective approach for natural language to code translation.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.792774677276611, 3.505635976791382], "id": 536, "title": "Life is a Circus and We are the Clowns: Automatically Finding Analogies between Situations and Processes", "authors": "Oren Sultan and Dafna Shahaf", "abstract": "Analogy-making gives rise to reasoning, abstraction, flexible categorization and counterfactual inference -- abilities lacking in even the best AI systems today. Much research has suggested that analogies are key to non-brittle systems that can adapt to new domains. Despite their importance, analogies received little attention in the NLP community, with most research focusing on simple word analogies. Work that tackled more complex analogies relied heavily on manually constructed, hard-to-scale input representations.\nIn this work, we explore a more realistic, challenging setup: our input is a pair of natural language procedural texts, describing a situation or a process (e.g., how the heart works/how a pump works). \nOur goal is to automatically extract entities and their relations from the text and find a mapping between the different domains based on relational similarity (e.g., blood is mapped to water). \n\nWe develop an interpretable, scalable algorithm and demonstrate that it identifies the correct mappings 87% of the time for procedural texts and 94% for stories from cognitive-psychology literature. We show it can extract analogies from a large dataset of procedural texts, achieving 79% precision (analogy prevalence in data: 3%). Lastly, we demonstrate that our algorithm is robust to paraphrasing the input texts", "track": "NLP Applications", "label": 0}, {"loc": [9.133277893066406, 6.43787956237793], "id": 547, "title": "Language Contamination Helps Explains the Cross-lingual Capabilities of English Pretrained Models", "authors": "Terra Blevins and Luke Zettlemoyer", "abstract": "English pretrained language models, which make up the backbone of many modern NLP systems, require huge amounts of unlabeled training data. These models are generally presented as being trained only on English text but have been found to transfer surprisingly well to other languages. We investigate this phenomenon and find that common English pretraining corpora actually contain significant amounts of non-English text: even when less than 1% of data is not English (well within the error rate of strong language classifiers), this leads to hundreds of millions of foreign language tokens in large-scale datasets. We then demonstrate that even these small percentages of non-English data facilitate cross-lingual transfer for models trained on them, with target language performance strongly correlated to the amount of in-language data seen during pretraining. In light of these findings, we argue that no model is truly monolingual when pretrained at scale, which should be considered when evaluating cross-lingual transfer.", "track": "Multilinguality", "label": 13}, {"loc": [9.101027488708496, 6.344759464263916], "id": 548, "title": "Analyzing the Mono- and Cross-Lingual Pretraining Dynamics of Multilingual Language Models", "authors": "Terra Blevins, Hila Gonen and Luke Zettlemoyer", "abstract": "The emergent cross-lingual transfer seen in multilingual pretrained models has sparked significant interest in studying their behavior. However, because these analyses have focused on fully trained multilingual models, little is known about the dynamics of the multilingual pretraining process. We investigate when these models acquire their in-language and cross-lingual abilities by probing checkpoints taken from throughout XLM-R pretraining, using a suite of linguistic tasks. Our analysis shows that the model achieves high in-language performance early on, with lower-level linguistic skills acquired before more complex ones. In contrast, the point in pretraining when the model learns to transfer cross-lingually differs across language pairs. Interestingly, we also observe that, across many languages and tasks, the final model layer exhibits significant performance degradation over time, while linguistic knowledge propagates to lower layers of the network. Taken together, these insights highlight the complexity of multilingual pretraining and the resulting varied behavior for different languages over time.", "track": "Multilinguality", "label": 13}, {"loc": [10.51369857788086, 7.22982931137085], "id": 566, "title": "Neural Machine Translation with Contrastive Translation Memories", "authors": "Xin Cheng, Shen Gao, Lemao Liu, Dongyan Zhao and Rui Yan", "abstract": "Retrieval-augmented Neural Machine Translation models have been successful in many translation scenarios. Different from previous works that make use of mutually similar but redundant translation memories~(TMs), we propose a new retrieval-augmented NMT to model contrastively retrieved translation memories that are holistically similar to the source sentence while individually contrastive to each other providing maximal information gain in three phases. First, in TM retrieval phase, we adopt contrastive retrieval algorithm to avoid redundancy and uninformativeness of similar translation pieces. Second, in memory encoding stage, given a set of TMs we propose a novel Hierarchical Group Attention module to gather both local context of each TM and global context of the whole TM set. Finally, in training phase, a Multi-TM contrastive learning objective is introduced to learn salient feature of each TM with respect to target sentence. Experimental results show that our framework obtains substantial improvements over strong baselines in the benchmark dataset.", "track": "Machine Translation", "label": 10}, {"loc": [1.6771990060806274, 8.707792282104492], "id": 569, "title": "Distilling Causal Effect from Miscellaneous Other-Class for Continual Named Entity Recognition", "authors": "Junhao Zheng, Zhanxian Liang, Haibin Chen and Qianli Ma", "abstract": "Continual Learning for Named Entity Recognition (CL-NER) aims to learn a growing number of entity types over time from a stream of data. However, simply learning Other-Class in the same way as new entity types amplifies the catastrophic forgetting and leads to a substantial performance drop. The main cause behind this is that Other-Class samples usually contain old entity types, and the old knowledge in these Other-Class samples is not preserved properly. Thanks to the causal inference, we identify that the forgetting is caused by the missing causal effect from the old data.To this end, we propose a unified causal framework to retrieve the causality from both new entity types and Other-Class.Furthermore, we apply curriculum learning to mitigate the impact of label noise and introduce a self-adaptive weight for balancing the causal effects between new entity types and Other-Class. Experimental results on three benchmark datasets show that our method outperforms the state-of-the-art method by a large margin. Moreover, our method can be combined with the existing state-of-the-art methods to improve the performance in CL-NER.", "track": "NLP Applications", "label": 0}, {"loc": [7.22475528717041, 6.848784446716309], "id": 570, "title": "Exploring the Secrets Behind the Learning Difficulty of Meaning Representations for Semantic Parsing", "authors": "Zhenwen Li, Jiaqi Guo, Qian Liu, Jian-Guang LOU and Tao Xie", "abstract": "Previous research has shown that the design of Meaning Representation (MR) greatly influences the final model performance of a neural semantic parser. Therefore, designing a good MR is a long-term goal for semantic parsing. However, it is still an art as there is no quantitative indicator that can tell us which MR among a set of candidates may have the best final model performance. In practice, in order to\nselect an MR design, researchers often have to go through the whole training-testing process for all design candidates, and the process often costs a lot. In this paper, we propose a data-aware metric called ISS (denoting incremental structural stability) of MRs, and demonstrate that ISS is highly correlated with the final performance. The finding shows that ISS can be used as an indicator for MR design to avoid the costly training-testing process.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [5.544415473937988, 12.263265609741211], "id": 577, "title": "That's the Wrong Lung! Evaluating and Improving the Interpretability of Unsupervised Multimodal Encoders for Medical Data", "authors": "Jered McInerney, Geoffrey Young, Jan-Willem van de Meent and Byron Wallace", "abstract": "Pretraining multimodal models on Electronic Health Records (EHRs) provides a means of learning representations that can transfer to downstream tasks with minimal supervision. Recent multimodal models induce soft local alignments between image regions and sentences. This is of particular interest in the medical domain, where alignments might highlight regions in an image relevant to specific phenomena described in free-text. While past work has suggested that attention \"heatmaps\u201d can be interpreted in this manner, there has been little evaluation of such alignments. We compare alignments from a state-of-the-art multimodal (image and text) model for EHR with human annotations that link image regions to sentences. Our main finding is that the text has an often weak or unintuitive influence on attention; alignments do not consistently reflect basic anatomical information. Moreover, synthetic modifications \u2014 such as substituting \"left\u201d for \"right\u201d \u2014 do not substantially influence highlights. Simple techniques such as allowing the model to opt out of attending to the image and few-shot finetuning show promise in terms of their ability to improve alignments with very little or no supervision. We make our code and checkpoints open-source.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [8.765167236328125, 6.520447731018066], "id": 582, "title": "Unsupervised Tokenization Learning", "authors": "Anton Kolonin and Vignav Ramesh", "abstract": "In the presented study, we discover that the so-called \"transition freedom\u201d metric appears superior for unsupervised tokenization purposes in comparison to statistical metrics such as mutual information and conditional probability, providing F-measure scores in range from 0.71 to 1.0 across explored multilingual corpora. We find that different languages require different offshoots of that metric (such as derivative, variance, and \"peak values\u201d) for successful tokenization. Larger training corpora do not necessarily result in better tokenization quality, while compressing the models by eliminating statistically weak evidence tends to improve performance. The proposed unsupervised tokenization technique provides quality better than or comparable to lexicon-based ones, depending on the language.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [10.656447410583496, 6.976902961730957], "id": 590, "title": "A Template-based Method for Constrained Neural Machine Translation", "authors": "Shuo Wang, Peng Li, Zhixing Tan, Zhaopeng Tu, Maosong Sun and Yang Liu", "abstract": "Machine translation systems are expected to cope with various types of constraints in many practical scenarios. While neural machine translation (NMT) has achieved strong performance in unconstrained cases, it is non-trivial to impose pre-specified constraints into the translation process of NMT models. Although many approaches have been proposed to address this issue, most existing methods can not satisfy the following three desiderata at the same time: (1) high translation quality, (2) high match accuracy, and (3) low latency. In this work, we propose a template-based method that can yield results with high translation quality and match accuracy and the inference speed of our method is comparable with unconstrained NMT models. Our basic idea is to rearrange the generation of constrained and unconstrained tokens through a template. Our method does not require any changes in the model architecture and the decoding algorithm. Experimental results show that the proposed template-based approach can outperform several representative baselines in both lexically and structurally constrained translation tasks.", "track": "Machine Translation", "label": 10}, {"loc": [8.615113258361816, 8.294707298278809], "id": 597, "title": "PATS: Sensitivity-aware Noisy Learning for Pretrained Language Models", "authors": "Yupeng Zhang, Hongzhi Zhang, Sirui Wang, Wei Wu and Zhoujun Li", "abstract": "A wide range of NLP tasks benefit from the fine-tuning of pretrained language models (PLMs). However, a number of redundant parameters which contribute less to the downstream task are observed in a directly fine-tuned model. We consider the gap between pretraining and downstream tasks hinders the training of these redundant parameters, and results in a suboptimal performance of the overall model. In this paper, we present PATS (Perturbation According To Sensitivity), a noisy training mechanism which considers each parameter's importance in the downstream task to help fine-tune PLMs. The main idea of PATS is to add bigger noise to parameters with lower sensitivity and vice versa, in order to activate more parameters' contributions to downstream tasks without affecting the sensitive ones much. Extensive experiments conducted on different tasks of the GLUE benchmark show PATS can consistently empower the fine-tuning of different sizes of PLMs, and the parameters in the well-performing models always have more concentrated distributions of sensitivities, which experimentally proves the effectiveness of our method.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [6.902408123016357, 9.890228271484375], "id": 611, "title": "Towards Reinterpreting Neural Topic Models via Composite Activations", "authors": "Jia Peng Lim and Hady Lauw", "abstract": "Most Neural Topic Models (NTM) use a variational auto-encoder framework producing K topics limited to the size of the encoder's output. These topics are interpreted through the selection of the top activated words via the weights or reconstructed vector of the decoder that are directly connected to each neuron. In this paper, we present a model-free two-stage process to reinterpret NTM and derive further insights on the state of the trained model. Firstly, building on the original information from a trained NTM, we generate a pool of potential candidate \"composite topics\" by exploiting possible co-occurrences within the original set of topics, which decouples the strict interpretation of topics from the original NTM. This is followed by a combinatorial formulation to select a final set of composite topics, which we evaluate for coherence and diversity on a large external corpus. Lastly, we employ a user study to derive further insights on the reinterpretation process.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [3.785860061645508, 9.92243480682373], "id": 630, "title": "Few-shot Query-Focused Summarization with Prefix-Merging", "authors": "Ruifeng Yuan, zili Wang, Ziqiang Cao and Wenjie Li", "abstract": "Query-focused summarization has been considered as an important extension for text summarization. It aims to generate a concise highlight for a given query. Different from text summarization, query-focused summarization has long been plagued by the problem of lacking high-quality large-scale datasets. In this paper, we investigate the idea that whether we can integrate and transfer the knowledge of text summarization and question answering to assist the few-shot learning in query-focused summarization. Here, we propose prefix-merging, a prefix-based pretraining strategy for few-shot learning in query-focused summarization. Drawn inspiration from prefix-tuning, we are allowed to integrate the task knowledge from text summarization and question answering into a properly designed prefix and apply the merged prefix to query-focused summarization. With only a small amount of trainable parameters, prefix-merging outperforms fine-tuning on query-focused summarization. We further discuss the influence of different prefix designs and propose a visualized explanation for how prefix-merging works.", "track": "Summarization", "label": 14}, {"loc": [9.69129753112793, 6.506754398345947], "id": 638, "title": "Cross-Align: Modeling Deep Cross-lingual Interactions for Word Alignment", "authors": "Siyu Lai, Zhen Yang, Fandong Meng, Yufeng Chen, Jinan Xu and Jie Zhou", "abstract": "Word alignment which aims to extract lexicon translation equivalents between source and target sentences, serves as a fundamental tool for natural language processing. Recent studies in this area have yielded substantial improvements by generating alignments from contextualized embeddings of the pre-trained multilingual language models. However, we find that the existing approaches capture few interactions between the input sentence pairs, which degrades the word alignment quality severely, especially for the ambiguous words in the monolingual context. To remedy this problem, we propose Cross-Align to model deep interactions between the input sentence pairs, in which the source and target sentences are encoded separately with the shared self-attention modules in the shallow layers, while cross-lingual interactions are explicitly constructed by the cross-attention modules in the upper layers. Besides, to train our model effectively, we propose a two-stage training framework, where the model is trained with a simple Translation Language Modeling (TLM) objective in the first stage and then finetuned with a self-supervised alignment objective in the second stage. Experiments show that the proposed Cross-Align achieves the state-of-the-art (SOTA) performance on four out of five language pairs.", "track": "Multilinguality", "label": 13}, {"loc": [5.557558536529541, 8.130949974060059], "id": 663, "title": "BERTScore is Unfair: On Social Bias in Language Model-Based Metrics for Text Generation", "authors": "Tianxiang Sun, Junliang He, Xipeng Qiu and Xuanjing Huang", "abstract": "Automatic evaluation metrics are crucial to the development of generative systems. In recent years, pre-trained language model (PLM) based metrics, such as BERTScore, have been commonly adopted in various generation tasks. However, it has been demonstrated that PLMs encode a range of stereotypical societal biases, leading to a concern about the fairness of PLMs as metrics. To that end, this work presents the first systematic study on the social bias in PLM-based metrics. We demonstrate that popular PLM-based metrics exhibit significantly higher social bias than traditional metrics on 6 sensitive attributes, namely race, gender, religion, physical appearance, age, and socioeconomic status. In-depth analysis suggests that choosing paradigms (matching, regression, or generation) of the metric has a greater impact on fairness than choosing PLMs. In addition, we develop debiasing adapters that are injected into PLM layers, mitigating bias in PLM-based metrics while retaining high performance for evaluating text generation.", "track": "Ethics", "label": 21}, {"loc": [7.265092372894287, 9.717418670654297], "id": 667, "title": "HPT: Hierarchy-aware Prompt Tuning for Hierarchical Text Classification", "authors": "Zihan Wang, Peiyi Wang, Tianyu Liu, Binghuai Lin, Yunbo Cao, Zhifang Sui and Houfeng Wang", "abstract": "Hierarchical text classification (HTC) is a challenging subtask of multi-label classification due to its complex label hierarchy.\nRecently, the pretrained language models (PLM)\nhave been widely adopted in HTC through a fine-tuning paradigm. However, in this paradigm, there exists a huge gap between the classification tasks with sophisticated label hierarchy and the masked language model (MLM) pretraining tasks of PLMs and thus the potential of PLMs cannot be fully tapped.\nTo bridge the gap, in this paper, we propose HPT, a Hierarchy-aware Prompt Tuning method to handle HTC from a multi-label MLM perspective.\nSpecifically, we construct a dynamic virtual template and label words that take the form of soft prompts to fuse the label hierarchy knowledge and introduce a zero-bounded multi-label cross-entropy loss to harmonize the objectives of HTC and MLM.\nExtensive experiments show HPT achieves state-of-the-art performances on 3 popular HTC datasets and is adept at handling the imbalance and low resource situations. Our code is available at https://github.com/wzh9969/HPT.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [8.052157402038574, 8.709228515625], "id": 674, "title": "Not to Overfit or Underfit the Source Domains? An Empirical Study of Domain Generalization in Question Answering", "authors": "Md Arafat Sultan, Avi Sil and Radu Florian", "abstract": "Machine learning models are prone to overfitting their training (source) domains, which is commonly believed to be the reason why they falter in novel target domains. Here we examine the contrasting view that multi-source domain generalization (DG) is first and foremost a problem of mitigating source domain underfitting: models not adequately learning the signal already present in their multi-domain training data. Experiments on a reading comprehension DG benchmark show that as a model learns its source domains better\u2014using familiar methods such as knowledge distillation (KD) from a bigger model\u2014its zero-shot out-of-domain utility improves at an even faster pace. Improved source domain learning also demonstrates superior out-of-domain generalization over three popular existing DG approaches that aim to limit overfitting. Our implementation of KD-based domain generalization is available via PrimeQA at: https://ibm.biz/domain-generalization-with-kd.", "track": "Question Answering", "label": 11}, {"loc": [5.983711242675781, 6.207870960235596], "id": 681, "title": "Neural Theory-of-Mind? On the Limits of Social Intelligence in Large LMs", "authors": "Maarten Sap, Ronan Le Bras, Daniel Fried and Yejin Choi", "abstract": "Social intelligence and Theory of Mind (TOM), i.e., the ability to reason about the different mental states, intents, and reactions of all people involved, allows humans to effectively navigate and understand everyday social interactions. As NLP systems are used in increasingly complex social situations, their ability to grasp social dynamics becomes crucial.\n\nIn this work, we examine the open question of social intelligence and Theory of Mind in modern NLP systems from an empirical and theorybased perspective. We show that one of today's largest language models (GPT-3; Brown et al., 2020) lacks this kind of social intelligence out-of-the box, using two tasks: SocialIQa (Sap et al., 2019), which measure models' ability to understand intents and reactions of participants of social interactions, and ToMi (Le, Boureau, and Nickel, 2019), which measures whether models can infer mental states and realities of participants of situations.\n\nOur results show that models struggle substantially at these Theory of Mind tasks, with well-below-human accuracies of 55% and 60% on SocialIQa and ToMi, respectively. To conclude, we draw on theories from pragmatics to contextualize this shortcoming of large language models, by examining the limitations stemming from their data, neural architecture, and training paradigms. Challenging the prevalent narrative that only scale is needed, we posit that person-centric NLP approaches might be more effective towards neural Theory of Mind.", "track": "Theme Track", "label": 18}, {"loc": [2.5048956871032715, 4.591599941253662], "id": 683, "title": "Improving Passage Retrieval with Zero-Shot Question Generation", "authors": "Devendra Sachan, Mike Lewis, Mandar Joshi, Armen Aghajanyan, Wen-tau Yih, Joelle Pineau and Luke Zettlemoyer", "abstract": "We propose a simple and effective re-ranking method for improving passage retrieval in open question answering. The re-ranker re-scores retrieved passages with a zero-shot question generation model, which uses a pre-trained language model to compute the probability of the input question conditioned on a retrieved passage. This approach can be applied on top of any retrieval method (e.g. neural or keyword-based), does not require any domain- or task-specific training (and therefore is expected to generalize better to data distribution shifts), and provides rich cross-attention between query and passage (i.e. it must explain every token in the question). When evaluated on a number of open-domain retrieval datasets, our re-ranker improves strong unsupervised retrieval models by 6%-18% absolute and strong supervised models by up to 12% in terms of top-20 passage retrieval accuracy. We also obtain new state-of-the-art results on full open-domain question answering by simply adding the new re-ranker to existing models with no further changes.", "track": "Question Answering", "label": 11}, {"loc": [3.701219081878662, 9.476897239685059], "id": 684, "title": "Summarizing Community-based Question-Answer Pairs", "authors": "Ting-Yao Hsu, Yoshi Suhara and Xiaolan Wang", "abstract": "Community-based Question Answering (CQA), which allows users to acquire their desired information, has increasingly become an essential component of online services in various domains such as E-commerce, travel, and dining. However, an overwhelming number of CQA pairs makes it difficult for users without particular intent to find useful information spread over CQA pairs. To help users quickly digest the key information, we propose the novel CQA summarization task that aims to create a concise summary from CQA pairs. To this end, we first design a multi-stage data annotation process and create a benchmark dataset, COQASUM, based on the Amazon QA corpus. We then compare a collection of extractive and abstractive summarization methods and establish a strong baseline approach DedupLED for the CQA summarization task. Our experiment further confirms two key challenges, sentence-type transfer and deduplication removal, towards the CQA summarization task. Our data and code are publicly available.", "track": "Summarization", "label": 14}, {"loc": [4.708431243896484, 4.635732650756836], "id": 687, "title": "Logical Reasoning with Span-Level Predictions for Interpretable and Robust NLI Models", "authors": "Joe Stacey, Pasquale Minervini, Haim Dubossarsky and Marek Rei", "abstract": "Current Natural Language Inference (NLI) models achieve impressive results, sometimes outperforming humans when evaluating on in-distribution test sets. However, as these models are known to learn from annotation artefacts and dataset biases, it is unclear to what extent the models are learning the task of NLI instead of learning from shallow heuristics in their training data.\n\nWe address this issue by introducing a logical reasoning framework for NLI, creating highly transparent model decisions that are based on logical rules. Unlike prior work, we show that improved interpretability can be achieved without decreasing the predictive accuracy. We almost fully retain performance on SNLI, while also identifying the exact hypothesis spans that are responsible for each model prediction.\n\nUsing the e-SNLI human explanations, we verify that our model makes sensible decisions at a span level, despite not using any span labels during training. We can further improve model performance and the span-level decisions by using the e-SNLI explanations during training. Finally, our model is more robust in a reduced data setting. When training with only 1,000 examples, out-of-distribution performance improves on the MNLI matched and mismatched validation sets by 13% and 16% relative to the baseline. Training with fewer observations yields further improvements, both in-distribution and out-of-distribution.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [4.543882846832275, 6.71017599105835], "id": 688, "title": "How to disagree well: Investigating the dispute tactics used on Wikipedia", "authors": "Christine De Kock and Andreas Vlachos", "abstract": "Disagreements are frequently studied from the perspective of either detecting toxicity or analysing argument structure. We propose a framework of dispute tactics which unifies these two perspectives, as well as other dialogue acts which play a role in resolving disputes, such as asking questions and providing clarification. This framework includes a preferential ordering among rebuttal-type tactics, ranging from ad hominem attacks to refuting the central argument. Using this framework, we annotate 213 disagreements (3,865 utterances) from Wikipedia Talk pages. This allows us to investigate research questions around the tactics used in disagreements; for instance, we provide empirical validation of the approach to disagreement recommended by Wikipedia. We develop models for multilabel prediction of dispute tactics in an utterance, achieving the best performance with a transformer-based label powerset model. Adding an auxiliary task to incorporate the ordering of rebuttal tactics further yields a statistically significant increase. Finally, we show that these annotations can be used to provide useful additional signals to improve performance on the task of predicting escalation.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [3.925905466079712, 9.288348197937012], "id": 696, "title": "Chapter Ordering in Novels", "authors": "Allen Kim and Steve Skiena", "abstract": "Understanding narrative flow and text coherence in long-form documents (novels) remains an open problem in NLP.\nTo gain insight, we explore the task of chapter ordering, reconstructing the original order of chapters in novel given a random permutation of the text. This can be seen as extending the well-known sentence ordering task to vastly larger documents: our task deals with over 9,000 novels with an average of twenty chapters each, versus standard sentence ordering datasets averaging only 5-8 sentences. We formulate the task of reconstructing order as a constraint solving problem, using minimum feedback arc set and traveling salesman problem optimization criteria, where the weights of the graph are generated based on models for character occurrences and chapter boundary detection, using relational chapter scores derived from RoBERTa. Our best methods yield a Spearman correlation of 0.59 on this novel and challenging task, substantially above baseline.", "track": "NLP Applications", "label": 0}, {"loc": [3.597039222717285, 4.510239124298096], "id": 699, "title": "Open-ended Knowledge Tracing for Computer Science Education", "authors": "Naiming Liu, Richard Baraniuk, Andrew Lan and Zichao Wang", "abstract": "In educational applications, knowledge tracing refers to the problem of estimating students' time-varying concept/skill mastery level from their past responses to questions and predicting their future performance.\nOne key limitation of most existing knowledge tracing methods is that they treat student responses to questions as binary-valued, i.e., whether they are correct or incorrect. \nResponse correctness analysis/prediction is straightforward, but it ignores important information regarding mastery, especially for open-ended questions.\nIn contrast, exact student responses can provide much more information.\nIn this paper, we conduct the first exploration int open-ended knowledge tracing (OKT) by studying the new task of predicting students' exact open-ended responses to questions.\nOur work is grounded in the domain of computer science education with programming questions. \nWe develop an initial solution to the OKT problem, a student knowledge-guided code generation approach, that combines program synthesis methods using language models with student knowledge tracing methods. \nWe also conduct a series of quantitative and qualitative experiments on a real-world student code dataset to validate and demonstrate the promise of OKT.", "track": "NLP Applications", "label": 0}, {"loc": [0.9639930725097656, 6.660263538360596], "id": 704, "title": "Logical Neural Networks for Knowledge Base Completion with Embeddings & Rules", "authors": "Prithviraj Sen, Breno William Carvalho, Ibrahim Abdelaziz, Pavan Kapanipathi, Salim Roukos and Alexander Gray", "abstract": "Knowledge base completion (KBC) has benefitted greatly by learning explainable rules in an human-interpretable dialect such as first-order logic. Rule-based KBC has so far, mainly focussed on learning one of two types of rules: conjunction-of-disjunctions and disjunction-of-conjunctions. We qualitatively show, via examples, that one of these has an advantage over the other when it comes to achieving high quality KBC. To the best of our knowledge, we are the first to propose learning both kinds of rules within a common framework. To this end, we propose to utilize logical neural networks (LNN), a powerful neuro-symbolic AI framework that can express both kinds of rules and learn these end-to-end using gradient-based optimization. Our in-depth experiments show that our LNN-based approach to learning rules for KBC leads to roughly 10% relative improvements, if not more, over SotA rule-based KBC methods. Moreover, by showing how to combine our proposed methods with knowledge graph embeddings we further achieve an additional 7.5% relative improvement.", "track": "Information Extraction", "label": 5}, {"loc": [5.597498416900635, 12.144219398498535], "id": 707, "title": "MedCLIP: Contrastive Learning from Unpaired Medical Images and Text", "authors": "Zifeng Wang, Zhenbang Wu, Dinesh Agarwal and Jimeng Sun", "abstract": "Existing vision-text contrastive learning like CLIP aims to match the paired image and caption embeddings while pushing others apart, which improves representation transferability and supports zero-shot prediction. However, medical image-text datasets are orders of magnitude below the general images and captions from the internet. Moreover, previous methods encounter many false negatives, i.e., images and reports from separate patients probably carry the same semantics but are wrongly treated as negatives. In this paper, we decouple images and texts for multimodal contrastive learning, thus scaling the usable training data in a combinatorial magnitude with low cost. We also propose to replace the InfoNCE loss with semantic matching loss based on medical knowledge to eliminate false negatives in contrastive learning. We prove that MedCLIP is a simple yet effective framework: it outperforms state-of-the-art methods on zero-shot prediction, supervised classification, and image-text retrieval. Surprisingly, we observe that with only 20K pre-training data, MedCLIP wins over the state-of-the-art method (using 200K data). The code is available at https://github.com/RyanWangZf/MedCLIP.", "track": "NLP Applications", "label": 0}, {"loc": [8.752326965332031, 8.1732759475708], "id": 731, "title": "GA-SAM: Gradient-Strength based Adaptive Sharpness-Aware Minimization for Improved Generalization", "authors": "Zhiyuan Zhang, Ruixuan Luo, Qi Su and Xu Sun", "abstract": "Recently, Sharpness-Aware Minimization (SAM) algorithm has shown state-of-the-art generalization abilities in vision tasks. It demonstrates that flat minima tend to imply better generalization abilities. However, it has some difficulty implying SAM to some natural language tasks, especially to models with drastic gradient changes, such as RNNs. In this work, we analyze the relation between the flatness of the local minimum and its generalization ability from a novel and straightforward theoretical perspective. We propose that the shift of the training and test distributions can be equivalently seen as a virtual parameter corruption or perturbation, which can explain why flat minima that are robust against parameter corruptions or perturbations have better generalization performances. On its basis, we propose a Gradient-Strength based Adaptive Sharpness-Aware Minimization (GA-SAM) algorithm to help to learn algorithms find flat minima that generalize better. Results in various language benchmarks validate the effectiveness of the proposed GA-SAM algorithm on natural language tasks.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [7.719780921936035, 7.858575820922852], "id": 732, "title": "Sparse Teachers Can Be Dense with Knowledge", "authors": "Yi Yang, Chen Zhang and Dawei Song", "abstract": "Recent advances in distilling pretrained language models have discovered that, besides the expressiveness of knowledge, the student-friendliness should be taken into consideration to realize a truly knowledgeable teacher. Based on a pilot study, we find that over-parameterized teachers can produce expressive yet student-unfriendly knowledge and are thus limited in overall knowledgeableness. To remove the parameters that result in student-unfriendliness, we propose a sparse teacher trick under the guidance of an overall knowledgeable score for each teacher parameter. The knowledgeable score is essentially an interpolation of the expressiveness and student-friendliness scores. The aim is to ensure that the expressive parameters are retained while the student-unfriendly ones are removed. Extensive experiments on the GLUE benchmark show that the proposed sparse teachers can be dense with knowledge and lead to students with compelling performance in comparison with a series of competitive baselines.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [8.194052696228027, 8.937093734741211], "id": 739, "title": "BBTv2: Towards a Gradient-Free Future with Large Language Models", "authors": "Tianxiang Sun, Zhengfu He, Hong Qian, Yunhua Zhou, Xuanjing Huang and Xipeng Qiu", "abstract": "Most downstream adaptation methods tune all or part of the parameters of pre-trained models (PTMs) through gradient descent, where the tuning cost increases linearly with the growth of the model size.\nBy contrast, gradient-free methods only require the forward computation of the PTM to tune the prompt, retaining the benefits of efficient tuning and deployment.\nThough, past work on gradient-free tuning often introduces gradient descent to seek a good initialization of prompt and lacks versatility across tasks and PTMs.\nIn this paper, we present BBTv2, an improved version of Black-Box Tuning, to drive PTMs for few-shot learning.\nWe prepend continuous prompts to every layer of the PTM and propose a divide-and-conquer gradient-free algorithm to optimize the prompts at different layers alternately.\nExtensive experiments across various tasks and PTMs show that BBTv2 can achieve comparable performance to full model tuning and state-of-the-art parameter-efficient methods (e.g., Adapter, LoRA, BitFit, etc.) under few-shot settings while maintaining much fewer tunable parameters.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [2.4545364379882812, 4.608763217926025], "id": 746, "title": "Passage-Mask: A Learnable Regularization Strategy for Retriever-Reader Models", "authors": "Shujian Zhang, Chengyue Gong and Xingchao Liu", "abstract": "Retriever-reader models achieve competitive performance across many different NLP tasks such as open question answering and dialogue conversations. In this work, we notice these models easily overfit the top-rank retrieval passages and standard training fails to reason over the entire retrieval passages. We introduce a learnable passage mask mechanism which desensitizes the impact from the top-rank retrieval passages and prevents the model from overfitting. Controlling the gradient variance with fewer mask candidates and selecting the mask candidates with one-shot bi-level optimization, our learnable regularization strategy enforces the answer generation to focus on the entire retrieval passages. Experiments on different tasks across open question answering, dialogue conversation, and fact verification show that our method consistently outperforms its baselines. Extensive experiments and ablation studies demonstrate that our method can be general, effective, and beneficial for many NLP tasks.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [8.391736030578613, 8.031400680541992], "id": 752, "title": "Mixed-effects transformers for hierarchical adaptation", "authors": "Julia White, Noah Goodman and Robert Hawkins", "abstract": "Language differs dramatically from context to context. To some degree, large language models like GPT-3 account for such variation by conditioning on strings of initial input text, or prompts. However, prompting can be ineffective when contexts are sparse, out-of-sample, or extra-textual. In this paper, we introduce the mixed-effects transformer (MET), a novel approach for learning hierarchically-structured prefixes--- lightweight modules prepended to an input sequence--- to account for structured variation in language use. Specifically, we show how the popular class of mixed-effects regression models may be extended to transformer-based architectures using a regularized prefix-tuning procedure with dropout. We evaluate this approach on several domain-adaptation benchmarks, finding that it learns contextual variation from minimal data while generalizing well to unseen contexts.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [8.009950637817383, 9.670308113098145], "id": 756, "title": "On Measuring the Intrinsic Few-Shot Hardness of Datasets", "authors": "Xinran Zhao, Shikhar Murty and Christopher D. Manning", "abstract": "While advances in pre-training have led to dramatic improvements in few-shot learning of NLP tasks, there is limited understanding of what drives successful few-shot adaptation in datasets. In particular, given a new dataset and a pre-trained model, what properties of the dataset make it few-shot learnable, and are these properties independent of the specific adaptation techniques used? We consider an extensive set of recent few-shot learning methods and show that their performance across a large number of datasets is highly correlated, showing that few-shot hardness may be intrinsic to datasets, for a given pre-trained model. To estimate intrinsic few-shot hardness, we then propose a simple and lightweight metric called Spread that captures the intuition that few-shot learning is made possible by exploiting feature-space invariances between training and test samples. Our metric better accounts for few-shot hardness compared to existing notions of hardness and is ~8-100x faster to compute.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [6.989253520965576, 9.617755889892578], "id": 757, "title": "Group is better than individual: Exploiting Label Topologies and Label Relations for Joint Multiple Intent Detection and Slot Filling", "authors": "Bowen Xing and Ivor Tsang", "abstract": "Recent joint multiple intent detection and slot filling models employ label embeddings to achieve the semantics-label interactions.\nHowever, they treat all labels and label embeddings as uncorrelated individuals, ignoring the dependencies among them. Besides, they conduct the decoding for the two tasks independently, without leveraging the correlations between them.\nTherefore, in this paper, we first construct a Heterogeneous Label Graph (HLG) containing two kinds of topologies: (1) statistical dependencies based on labels' co-occurrence patterns and hierarchies in slot labels; (2) rich relations among the label nodes.\nThen we propose a novel model termed ReLa-Net.\nIt can capture beneficial correlations among the labels from HLG.\nThe label correlations are leveraged to enhance semantic-label interactions. Moreover, we also propose the label-aware inter-dependent decoding mechanism to further exploit the label correlations for decoding. \nExperiment results show that our ReLa-Net significantly outperforms previous models.\nRemarkably, ReLa-Net surpasses the previous best model by over 20\\% in terms of overall accuracy on MixATIS dataset.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [3.886317253112793, 9.211671829223633], "id": 765, "title": "An Empirical Study on Finding Spans", "authors": "Weiwei Gu, Boyuan Zheng, Yunmo Chen, Tongfei Chen and Benjamin Van Durme", "abstract": "We present an empirical study on methods for span finding, the selection of consecutive tokens in text for some downstream tasks. We focus on approaches that can be employed in training end-to-end information extraction systems, and find there is no definitive solution without considering task properties, and provide our observations to help with future design choices: 1) a tagging approach often yields higher precision while span enumeration and boundary prediction provide higher recall; 2) span type information can benefit a boundary prediction approach; 3) additional contextualization does not help span finding in most cases.", "track": "Information Extraction", "label": 5}, {"loc": [5.745916843414307, 11.904414176940918], "id": 767, "title": "MGDoc: Pre-training with Multi-granular Hierarchy for Document Image Understanding", "authors": "Zilong Wang, Jiuxiang Gu, Chris Tensmeyer, Nikolaos Barmpalios, Ani Nenkova, Tong Sun, Jingbo Shang and Vlad I. Morariu", "abstract": "Document images are a ubiquitous source of data where the text is organized in a complex hierarchical structure ranging from fine granularity (e.g., words), medium granularity (e.g., regions such as paragraphs or figures), to coarse granularity (e.g., the whole page). The spatial hierarchical relationships between content at different levels of granularity are crucial for document image understanding tasks. Existing methods learn features from either word-level or region-level but fail to consider both simultaneously. Word-level models are restricted by the fact that they originate from pure-text language models, which only encode the word-level context. In contrast, region-level models attempt to encode regions corresponding to paragraphs or text blocks into a single embedding, but they perform worse with additional word-level features. To deal with these issues, we propose MGDoc, a new multi-modal multi-granular pre-training framework that encodes page-level, region-level, and word-level information at the same time. MGDoc uses a unified text-visual encoder to obtain multi-modal features across different granularities, which makes it possible to project the multi-granular features into the same hyperspace. To model the region-word correlation, we design a cross-granular attention mechanism and specific pre-training tasks for our model to reinforce the model of learning the hierarchy between regions and words. Experiments demonstrate that our proposed model can learn better features that perform well across granularities and lead to improvements in downstream tasks.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.359671115875244, 8.668365478515625], "id": 774, "title": "Understanding Jargon: Combining Extraction and Generation for Definition Modeling", "authors": "Jie Huang, Hanyin Shao, Kevin Chang, Jinjun Xiong and Wen-mei Hwu", "abstract": "Can machines know what twin prime is? From the composition of this phrase, machines may guess twin prime is a certain kind of prime, but it is still difficult to deduce exactly what twin stands for without additional knowledge. Here, twin prime is a jargon - a specialized term used by experts in a particular field. Explaining jargon is challenging since it usually requires domain knowledge to understand. Recently, there is an increasing interest in extracting and generating definitions of words automatically. However, existing approaches, either extraction or generation, perform poorly on jargon. In this paper, we propose to combine extraction and generation for jargon definition modeling: first extract self- and correlative definitional information of target jargon from the Web and then generate the final definitions by incorporating the extracted definitional information. Our framework is remarkably simple but effective: experiments demonstrate our method can generate high-quality definitions for jargon and outperform state-of-the-art models significantly, e.g., BLEU score from 8.76 to 22.66 and human-annotated score from 2.34 to 4.04.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [4.707064151763916, 7.065857887268066], "id": 775, "title": "ProsocialDialog: A Prosocial Backbone for Conversational Agents", "authors": "Hyunwoo Kim, Youngjae Yu, Liwei Jiang, Ximing Lu, Daniel Khashabi, Gunhee Kim, Yejin Choi and Maarten Sap", "abstract": "Most existing dialogue systems fail to respond properly to potentially unsafe user utterances by either ignoring or passively agreeing with them. To address this issue, we introduce ProsocialDialog, the first large-scale multi-turn dialogue dataset to teach conversational agents to respond to problematic content following social norms. Covering diverse unethical, problematic, biased, and toxic situations, ProsocialDialog contains responses that encourage prosocial behavior, grounded in commonsense social rules (i.e., rules-of-thumb, RoTs). Created via a human-AI collaborative framework, ProsocialDialog consists of 58K dialogues, with 331K utterances, 160K unique RoTs, and 497K dialogue safety labels accompanied by free-form rationales.\n\nWith this dataset, we introduce a dialogue safety detection module, Canary, capable of generating RoTs given conversational context, and a socially-informed dialogue agent, Prost. Empirical results show that Prost generates more socially acceptable dialogues compared to other state-of-the-art language and dialogue models in both in-domain and out-of-domain settings. Additionally, Canary effectively guides conversational agents and off-the-shelf language models to generate significantly more prosocial responses. Our work highlights the promise and importance of creating and steering conversational AI to be socially responsible.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [7.250244617462158, 9.73892593383789], "id": 779, "title": "Exploiting Global and Local Hierarchies for Hierarchical Text Classification", "authors": "Ting Jiang, Deqing Wang, Leilei Sun, Zhongzhi Chen, Fuzhen Zhuang and Qinghong Yang", "abstract": "Hierarchical text classification aims to leverage label hierarchy in multi-label text classification. Existing methods encode label hierarchy in a global view, where label hierarchy is treated as the static hierarchical structure containing all labels. Since global hierarchy is static and irrelevant to text samples, it makes these methods hard to exploit hierarchical information. Contrary to global hierarchy, local hierarchy as a structured labels hierarchy corresponding to each text sample. It is dynamic and relevant to text samples, which is ignored in previous methods. To exploit global and local hierarchies, we propose Hierarchy-guided BERT with Global and Local hierarchies (HBGL), which utilizes the large-scale parameters and prior language knowledge of BERT to model both global and local hierarchies. Moreover, HBGL avoids the intentional fusion of semantic and hierarchical modules by directly modeling semantic and hierarchical information with BERT. Compared with the state-of-the-art method HGCLR, our method achieves significant improvement on three benchmark datasets.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [7.304051876068115, 6.875697612762451], "id": 782, "title": "Semantic-aware Contrastive Learning for More Accurate Semantic Parsing", "authors": "Shan Wu, Chunlei Xin, Bo Chen, Xianpei Han and Le Sun", "abstract": "Since the meaning representations are detailed and accurate annotations which express fine-grained sequence-level semtantics, it is usually hard to train discriminative semantic parsers via Maximum Likelihood Estimation (MLE) in an autoregressive fashion. In this paper, we propose a semantic-aware contrastive learning algorithm, which can learn to distinguish fine-grained meaning representations and take the overall sequence-level semantic into consideration. Specifically, a multi-level online sampling algorithm is proposed to sample confusing and diverse instances. Three semantic-aware similarity functions are designed to accurately measure the distance between meaning representations as a whole. And a ranked contrastive loss is proposed to pull the representations of the semantic-identical instances together and push negative instances away. Experiments on two standard datasets show that our approach achieves significant improvements over MLE baselines and gets state-of-the-art performances by simply applying semantic-aware contrastive learning on a vanilla Seq2Seq model.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [3.551393747329712, 9.897439956665039], "id": 785, "title": "Scientific Paper Extractive Summarization Enhanced by Citation Graphs", "authors": "Xiuying Chen, Mingzhe Li, Shen Gao, Rui Yan, Xin Gao and Xiangliang Zhang", "abstract": "In a citation graph, adjacent paper nodes share related scientific terms and topics. \nThe graph thus conveys unique structure information of document-level relatedness that can be utilized in the paper summarization task, for exploring beyond the intra-document information.\nIn this work, we focus on leveraging citation graphs to improve scientific paper extractive summarization under different settings.\nWe first propose a Multi-granularity Unsupervised Summarization model (MUS) as a simple and low-cost solution to the task.\nMUS finetunes a pre-trained encoder model on the citation graph by link prediction tasks.\nThen, the abstract sentences are extracted from the corresponding paper considering multi-granularity information.\nPreliminary results demonstrate that citation graph is helpful even in a simple unsupervised framework.\nMotivated by this, we next propose a Graph-based Supervised Summarizationmodel (GSS) to achieve more accurate results on the task when large-scale labeled data are available.\nApart from employing the link prediction as an auxiliary task, GSS introduces a gated sentence encoder and a graph information fusion module to take advantage of the graph information to polish the sentence representation.\nExperiments on a public benchmark dataset show that MUS and GSS bring substantial improvements over the prior state-of-the-art model.", "track": "Summarization", "label": 14}, {"loc": [2.3120298385620117, 8.654574394226074], "id": 802, "title": "Hardness-guided domain adaptation to recognise biomedical named entities under low-resource scenarios", "authors": "Ngoc Dang Nguyen, Lan Du, Wray Buntine, Changyou Chen and Richard Beare", "abstract": "Domain adaptation is an effective solution to data scarcity in low-resource scenarios. However, when applied to token-level tasks such as bioNER, domain adaptation methods often suffer from the challenging linguistic characteristics that clinical narratives possess, which leads to unsatsifactory performance. In this paper, we present a simple yet effective hardness-guided domain adaptation framework for bioNER tasks that can effectively leverage the domain hardness information to improve the adaptability of the learnt model in the low-resource scenarios. Experimental results on biomedical datasets show that our model can achieve significant performance improvement over the recently published state-of-the-art (SOTA) MetaNER model.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [0.963117778301239, 7.885301113128662], "id": 805, "title": "Syntactic Multi-view Learning for Open Information Extraction", "authors": "Kuicai Dong, Aixin Sun, Jung-Jae Kim and Xiaoli Li", "abstract": "Open Information Extraction (OpenIE) aims to extract relational tuples from open-domain sentences. \nTraditional rule-based or statistical models were developed based on syntactic structure of sentence, identified by syntactic parsers. However, previous neural OpenIE models under-explored the useful syntactic information. In this paper, we model both constituency and dependency trees into word-level graphs, and enable neural OpenIE to learn from the syntactic structures. To better fuse heterogeneous information from the two graphs, we adopt multi-view learning to capture multiple relationships from them. Finally, the finetuned constituency and dependency representations are aggregated with sentential semantic representations for tuple generation. Experiments show that both constituency and dependency information, and the multi-view learning are effective.", "track": "Information Extraction", "label": 5}, {"loc": [5.617773532867432, 11.984402656555176], "id": 817, "title": "TRIPS: Efficient Vision-and-Language Pre-training with Text-Relevant Image Patch Selection", "authors": "Chaoya Jiang, Haiyang Xu, Chenliang Li, Ming Yan, Wei Ye, Shikun Zhang, Bin Bi and Songfang Huang", "abstract": "Vision Transformers (ViTs) have been widely used in large-scale Vision and Language Pre-training (VLP) models. Though previous VLP works have proved the effectiveness of ViTs, they still suffer from computational efficiency brought by the long visual sequence. To tackle this problem, in this paper, we propose an efficient vision-and-language pre-training model with Text-Relevant Image Patch Selection, namely TRIPS, which reduces the visual sequence progressively with a text-guided patch-selection layer in the visual backbone for efficient training and inference. The patch-selection layer can dynamically compute text-dependent visual attention to identify the attentive image tokens with text guidance and fuse inattentive ones in an end-to-end manner. Meanwhile, TRIPS does not introduce extra parameters to ViTs. Experimental results on a variety of popular benchmark datasets demonstrate that TRIPS gain a speedup of 40% over previous similar VLP models, yet with competitive or better downstream task performance.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.411097526550293, 7.556691646575928], "id": 821, "title": "CGoDial: A Large-Scale Benchmark for Chinese Goal-oriented Dialog Evaluation", "authors": "Yinpei Dai, Wanwei He, Bowen Li, Yuchuan Wu, Zheng Cao, Zhongqi An, Jian Sun and Yongbin Li", "abstract": "Practical dialog systems need to deal with various knowledge sources, noisy user expressions, and the shortage of annotated data. To better solve the above problems, we propose CGoDial, a new challenging and comprehensive Chinese benchmark for multi-domain Goal-oriented Dialog evaluation. It contains 96,763 dialog sessions, and 574,949 dialog turns totally, covering three datasets with different knowledge sources: 1) a slot-based dialog (SBD) dataset with table-formed knowledge, 2) a flow-based dialog (FBD) dataset with tree-formed knowledge, and a retrieval-based dialog (RBD) dataset with candidate-formed knowledge. To bridge the gap between academic benchmarks and spoken dialog scenarios, we either collect data from real conversations or add spoken features to existing datasets via crowd-sourcing. The proposed experimental settings include the combinations of training with either the entire training set or a few-shot training set, and testing with either the standard test set or a hard test subset, which can assess model capabilities in terms of general prediction, fast adaptability and reliable robustness.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [6.27471923828125, 5.334392547607422], "id": 836, "title": "Kernel-Whitening: Overcome Dataset Bias with Isotropic Sentence Embedding", "authors": "SongYang Gao, Shihan Dou, Qi Zhang and Xuanjing Huang", "abstract": "Dataset bias has attracted increasing attention recently for its detrimental effect on the generalization ability of fine-tuned models. The current mainstream solution is designing an additional shallow model to pre-identify biased instances. However, such two-stage methods scale up the computational complexity of training process and obstruct valid feature information while mitigating bias.\nTo address this issue, we utilize the representation normalization method which aims at disentangling the correlations between features of encoded sentences. We find it also promising in eliminating the bias problem by providing isotropic data distribution. We further propose Kernel-Whitening, a Nystrom kernel approximation method to achieve more thorough debiasing on nonlinear spurious correlations. Our framework is end-to-end with similar time consumption to fine-tuning. Experiments show that Kernel-Whitening significantly improves the performance of BERT on out-of-distribution datasets while maintaining in-distribution accuracy.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [0.7796629667282104, 8.079690933227539], "id": 839, "title": "A Unified Positive-Unlabeled Learning Framework for Document-Level Relation Extraction with Different Levels of Labeling", "authors": "Ye Wang, Xinxin Liu, Wenxin Hu and Tao Zhang", "abstract": "Document-level relation extraction (RE) aims to identify relations between entities across multiple sentences. Most previous methods focused on document-level RE under full supervision. However, in real-world scenario, it is expensive and difficult to completely label all relations in a document because the number of entity pairs in document-level RE grows quadratically with the number of entities. To solve the common incomplete labeling problem, we propose a unified positive-unlabeled learning framework - shift and squared ranking loss positive-unlabeled (SSR-PU) learning. We use positive-unlabeled (PU) learning on document-level RE for the first time. Considering that labeled data of a dataset may lead to prior shift of unlabeled data, we introduce a PU learning under prior shift of training data. Also, using none-class score as an adaptive threshold, we propose squared ranking loss and prove its Bayesian consistency with multi-label ranking metrics. Extensive experiments demonstrate that our method achieves an improvement of about 14 F1 points relative to the previous baseline with incomplete labeling. In addition, it outperforms previous state-of-the-art results under both fully supervised and extremely unlabeled settings as well.", "track": "Information Extraction", "label": 5}, {"loc": [3.5793776512145996, 4.462541103363037], "id": 866, "title": "Automatic Generation of Socratic Subquestions for Teaching Math Word Problems", "authors": "Kumar Shridhar, Jakub Macina, Mennatallah El-Assady, Tanmay Sinha, Manu Kapur and Mrinmaya Sachan", "abstract": "Socratic questioning is an educational method that allows students to discover answers to complex problems by asking them a series of thoughtful questions. \nGeneration of didactically sound questions is challenging, requiring understanding of the reasoning process involved in the problem. We hypothesize that such questioning strategy can not only enhance the human performance, but also assist the math word problem (MWP) solvers.\nIn this work, we explore the ability of large language models (LMs) in generating sequential questions for guiding math word problem-solving. We propose various guided question generation schemes based on input conditioning and reinforcement learning.\nOn both automatic and human quality evaluations, we find that LMs constrained with desirable question properties generate superior questions and improve the overall performance of a math word problem solver. We conduct a preliminary user study to examine the potential value of such question generation models in the education domain. Results suggest that the difficulty level of problems plays an important role in determining whether questioning improves or hinders human performance. \nWe discuss the future of using such questioning strategies in education.", "track": "NLP Applications", "label": 0}, {"loc": [8.77017593383789, 7.868949890136719], "id": 872, "title": "Mixture of Attention Heads: Selecting Attention Heads Per Token", "authors": "Xiaofeng Zhang, Yikang Shen, Zeyu Huang, Jie Zhou, Wenge Rong and Zhang Xiong", "abstract": "Mixture-of-Experts (MoE) networks have been proposed as an efficient way to scale up model capacity and implement conditional computing. However, the study of MoE components mostly focused on the feedforward layer in Transformer architecture. This paper proposes the Mixture of Attention Heads (MoA), a new architecture that combines multi-head attention with the MoE mechanism. MoA includes a set of attention heads that each has its own set of parameters. Given an input, a router dynamically selects a subset of k attention heads per token. This conditional computation schema allows MoA to achieve stronger performance than the standard multi-head attention layer. Furthermore, the sparsely gated MoA can easily scale up the number of attention heads and the number of parameters while preserving computational efficiency. Despite performance improvements, MoA also automatically differentiates heads' utilities, providing a new perspective to discuss the model's interpretability. We conducted experiments on several important tasks, including Machine Translation and Masked Language Modeling. Experiments have shown promising results on several tasks against strong baselines that involve large and very deep models.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [8.782752990722656, 8.166292190551758], "id": 875, "title": "The Optimal BERT Surgeon: Scalable and Accurate Second-Order Pruning for Large Language Models", "authors": "Eldar Kurtic, Daniel Campos, Tuan A. Nguyen, Elias Frantar, Mark John Kurtz, Benjamin Fineran, Michael J. Goin and Dan Alistarh", "abstract": "In this paper, we consider the problem of sparsifying BERT models, which are a key building block for natural language processing, in order to reduce their storage and computational cost. We introduce the Optimal BERT Surgeon (oBERT), an efficient and accurate pruning method based on approximate second-order information, which we show to yield state-of-the-art results in both stages of language tasks: pre-training and fine-tuning. Specifically, oBERT extends existing work on second-order pruning by allowing for pruning weight blocks, and is the first such method that is applicable at BERT scale. \nSecond, we investigate compounding compression approaches to obtain highly compressed but accurate models for deployment on edge devices. These models significantly push boundaries of the current state-of-the-art sparse BERT models with respect to all metrics: model size, inference speed and task accuracy. For example, relative to the dense BERT-base, we obtain 10x model size compression with < 1% accuracy drop, 10x CPU-inference speedup with < 2% accuracy drop, and 29x CPU-inference speedup with < 7.5% accuracy drop. Our code, fully integrated with Transformers and SparseML, is available at https://github.com/neuralmagic/sparseml/tree/main/research/optimal_BERT_surgeon_oBERT.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [5.262276649475098, 11.673805236816406], "id": 876, "title": "Information-Theoretic Text Hallucination Reduction for Video-grounded Dialogue", "authors": "Sunjae Yoon, Eunseop Yoon, Hee Suk Yoon, Junyeong Kim and Chang Yoo", "abstract": "Video-grounded Dialogue (VGD) aims to decode an answer sentence to a question regarding a given video and dialogue context. Despite the recent success of multi-modal reasoning to generate answer sentences, existing dialogue systems still suffer from a text hallucination problem, which denotes indiscriminate text-copying from input texts without an understanding of the question. This is due to learning spurious correlations from the fact that answer sentences in the dataset usually include the words of input texts, thus the VGD system excessively relies on copying words from input texts by hoping those words to overlap with ground-truth texts. Hence, we design Text Hallucination Mitigating (THAM) framework, which incorporates Text Hallucination Regularization (THR) loss derived from the proposed information-theoretic text hallucination measurement approach. Applying THAM with current dialogue systems validates the effectiveness on VGD benchmarks (i.e., AVSD@DSTC7 and AVSD@DSTC8) and shows enhanced interpretability.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [1.7259498834609985, 5.424082279205322], "id": 887, "title": "DSM: Question Generation over Knowledge Base via Modeling Diverse Subgraphs with Meta-learner", "authors": "Shasha Guo, Jing Zhang, Yanling Wang, Qianyi Zhang, Cuiping Li and Hong Chen", "abstract": "Existing methods on knowledge base question generation (KBQG) learn a one-size-fits-all model by training together all subgraphs without distinguishing the diverse semantics of subgraphs. In this work, we show that making use of the past experience on semantically similar subgraphs can reduce the learning difficulty and promote the performance of KBQG models. To achieve this, we propose a novel approach to model diverse subgraphs with meta-learner (DSM). Specifically, we devise a graph contrastive learning-based retriever to identify semantically similar subgraphs, so that we can construct the semantics-aware learning tasks for the meta-learner to learn semantics-specific and semantics-agnostic knowledge on and across these tasks. Extensive experiments on two widely-adopted benchmarks for KBQG show that DSM derives new state-of-the-art performance and benefits the question answering tasks as a means of data augmentation.", "track": "Natural Language Generation", "label": 6}, {"loc": [0.5868709683418274, 7.674073696136475], "id": 894, "title": "RelU-Net: Syntax-aware Graph U-Net for Relational Triple Extraction", "authors": "Yunqi Zhang, Yubo Chen and Yongfeng Huang", "abstract": "Relational triple extraction is a critical task for natural language processing. Existing methods mainly focused on capturing semantic information, but suffered from ignoring the syntactic structure of the sentence, which is proved in the relation classification task to contain rich relational information. This is due to the absence of entity locations, which is the prerequisite for pruning noisy edges from the dependency tree, when extracting relational triples. In this paper, we propose a unified framework to tackle this challenge and incorporate syntactic information for relational triple extraction. First, we propose to automatically contract the dependency tree into a core relational topology and eliminate redundant information with graph pooling operations. Then, we propose a symmetrical expanding path with graph unpooling operations to fuse the contracted core syntactic interactions with the original sentence context. We also propose a bipartite graph matching objective function to capture the reflections between the core topology and golden relational facts. Since our model shares similar contracting and expanding paths with encoder-decoder models like U-Net, we name our model as Relation U-Net (RelU-Net). We conduct experiments on several datasets and the results prove the effectiveness of our method.", "track": "Information Extraction", "label": 5}, {"loc": [6.58177375793457, 8.390844345092773], "id": 895, "title": "Evidence > Intuition: Transferability Estimation for Encoder Selection", "authors": "Elisa Bassignana, Max M\u00fcller-Eberstein, Mike Zhang and Barbara Plank", "abstract": "With the increase in availability of large pre-trained language models (LMs) in Natural Language Processing (NLP), it becomes critical to assess their fit for a specific target task a priori\u2014as fine-tuning the entire space of available LMs is computationally prohibitive and unsustainable. However, encoder transferability estimation has received little to no attention in NLP. In this paper, we propose to generate quantitative evidence to predict which LM, out of a pool of models, will perform best on a target task without having to fine-tune all candidates. We provide a comprehensive study on LM ranking for 10 NLP tasks spanning the two fundamental problem types of classification and structured prediction. We adopt the state-of-the-art Logarithm of Maximum Evidence (LogME) measure from Computer Vision (CV) and find that \nit positively correlates with final LM performance in 94% of the setups.\nIn the first study of its kind, we further compare transferability measures with the de facto standard of human practitioner ranking, finding that evidence from quantitative metrics is more robust than pure intuition and can help identify unexpected LM candidates.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [10.416794776916504, 7.138895034790039], "id": 898, "title": "Chunk-based Nearest Neighbor Machine Translation", "authors": "Pedro Henrique Martins, Zita Marinho and Andr\u00e9 F. T. Martins", "abstract": "Semi-parametric models, which augment generation with retrieval, have led to impressive results in language modeling and machine translation, due to their ability to retrieve fine-grained information from a datastore of examples. One of the most prominent approaches, kNN-MT, exhibits strong domain adaptation capabilities by retrieving tokens from domain-specific datastores (Khandelwal et al., 2021). However, kNN-MT requires an expensive retrieval operation for every single generated token, leading to a very low decoding speed (around 8 times slower than a parametric model). In this paper, we introduce a chunk-based kNN-MT model which retrieves chunks of tokens from the datastore, instead of a single token. We propose several strategies for incorporating the retrieved chunks into the generation process, and for selecting the steps at which the model needs to search for neighbors in the datastore. Experiments on machine translation in two settings, static and \"on-the-fly\u201d domain adaptation, show that the chunk-based kNN-MT model leads to significant speed-ups (up to 4 times) with only a small drop in translation quality.", "track": "Machine Translation", "label": 10}, {"loc": [2.850724458694458, 4.680685997009277], "id": 913, "title": "FiE: Building a Global Probability Space by Leveraging Early Fusion in Encoder for Open-Domain Question Answering", "authors": "Akhil Kedia, Mohd Abbas Zaidi and Haejun Lee", "abstract": "Generative models have recently started to outperform extractive models in Open Domain Question Answering, largely by leveraging their decoder to attend over multiple encoded passages and combining their information. However, generative models tend to be larger than extractive models due to the need for a decoder, run slower during inference due to auto-regressive decoder beam search, and their generated output often suffers from hallucinations. We propose to extend transformer encoders with the ability to fuse information from multiple passages, using global representation to provide cross-sample attention over all tokens across samples. Furthermore, we propose an alternative answer span probability calculation to better aggregate answer scores in the global space of all samples. Using our proposed method, we outperform the current state-of-the-art method by 2.5 Exact Match score on the Natural Question dataset while using only 25% of parameters and 35% of the latency during inference, and 4.4 Exact Match on WebQuestions dataset. When coupled with synthetic data augmentation, we outperform larger models on the TriviaQA dataset as well. The latency and parameter savings of our method make it particularly attractive for open-domain question answering, as these models are often compute-intensive.", "track": "Question Answering", "label": 11}, {"loc": [0.63530033826828, 7.650243759155273], "id": 916, "title": "Inductive Relation Prediction with Logical Reasoning Using Contrastive Representations", "authors": "Yudai Pan, Jun Liu, Lingling Zhang, Tianzhe Zhao, Qika Lin, Xin Hu and Qianying Wang", "abstract": "Relation prediction in knowledge graphs (KGs) aims at predicting missing relations in incomplete triples, whereas the dominant embedding paradigm has a restriction on handling unseen entities during testing. In the real-world scenario, the inductive setting is more common because entities in the training process are finite. Previous methods capture an inductive ability by implicit logic in KGs. However, it would be challenging to preciously acquire entity-independent relational semantics of compositional logic rules and to deal with the deficient supervision of logic caused by the scarcity of relational semantics. To this end, we propose a novel graph convolutional network (GCN)-based model LogCo with logical reasoning by contrastive representations. LogCo firstly extracts enclosing subgraphs and relational paths between two entities to supply the entity-independence. Then a contrastive strategy for relational path instances and the subgraph is proposed for the issue of deficient supervision. The contrastive representations are learned for a joint training regime. Finally, prediction results and logic rules for reasoning are attained. Comprehensive experiments on twelve inductive datasets show that LogCo achieves outstanding performance comparing with state-of-the-art inductive relation prediction baselines.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [10.830070495605469, 9.347640991210938], "id": 920, "title": "Improving Chinese Spelling Check by Character Pronunciation Prediction: The Effects of Adaptivity and Granularity", "authors": "Jiahao Li, Quan Wang, Zhendong Mao, Junbo Guo, Yanyan Yang and Yongdong Zhang", "abstract": "Chinese spelling check (CSC) is a fundamental NLP task that detects and corrects spelling errors in Chinese texts. As most of these spelling errors are caused by phonetic similarity, effectively modeling the pronunciation of Chinese characters is a key factor for CSC. In this paper, we consider introducing an auxiliary task of Chinese pronunciation prediction (CPP) to improve CSC, and, for the first time, systematically discuss the adaptivity and granularity of this auxiliary task. We propose SCOPE which builds upon a shared encoder two parallel decoders, one for the primary CSC task and the other for a fine-grained auxiliary CPP task, with a novel adaptive weighting scheme to balance the two tasks. In addition, we design a delicate iterative correction strategy for further improvements during inference. Empirical evaluation shows that SCOPE achieves new state-of-the-art on three CSC benchmarks, demonstrating the effectiveness and superiority of the auxiliary CPP task. Comprehensive ablation studies further verify the positive effects of adaptivity and granularity of the task.", "track": "NLP Applications", "label": 0}, {"loc": [10.91057014465332, 6.819715976715088], "id": 932, "title": "MT-GenEval: A Counterfactual and Contextual Dataset for Evaluating Gender Accuracy in Machine Translation", "authors": "Anna Currey, Maria Nadejde, Raghavendra Reddy Pappagari, Mia Mayer, Stanislas Lauly, Xing Niu, Benjamin Hsu and Georgiana Dinu", "abstract": "As generic machine translation (MT) quality has improved, the need for targeted benchmarks that explore fine-grained aspects of quality has increased. In particular, gender accuracy in translation can have implications in terms of output fluency, translation accuracy, and ethics. In this paper, we introduce MT-GenEval, a benchmark for evaluating gender accuracy in translation from English into eight widely-spoken languages. MT-GenEval complements existing benchmarks by providing realistic, gender-balanced, counterfactual data in eight language pairs where the gender of individuals is unambiguous in the input segment, including multi-sentence segments requiring inter-sentential gender agreement. Our data and code is publicly available under a CC BY SA 3.0 license.", "track": "Machine Translation", "label": 10}, {"loc": [1.010554552078247, 10.48963451385498], "id": 933, "title": "A Span-level Bidirectional Network for Aspect Sentiment Triplet Extraction", "authors": "Yuqi Chen, Chen Keming, Xian Sun and Zequn Zhang", "abstract": "Aspect Sentiment Triplet Extraction (ASTE) is a new fine-grained sentiment analysis task that aims to extract triplets of aspect terms, sentiments, and opinion terms from review sentences. Recently, span-level models achieve gratifying results on ASTE task by taking advantage of the predictions of all possible spans. Since all possible spans significantly increases the number of potential aspect and opinion candidates, it is crucial and challenging to efficiently extract the triplet elements among them. In this paper, we present a span-level bidirectional network which utilizes all possible spans as input and extracts triplets from spans bidirectionally. Specifically, we devise both the aspect decoder and opinion decoder to decode the span representations and extract triples from aspect-to-opinion and opinion-to-aspect directions. With these two decoders complementing with each other, the whole network can extract triplets from spans more comprehensively. Moreover, considering that mutual exclusion cannot be guaranteed between the spans, we design a similar span separation loss to facilitate the downstream task of distinguishing the correct span by expanding the KL divergence of similar spans during the training process; in the inference process, we adopt an inference strategy to remove conflicting triplets from the results base on their confidence scores. Experimental results show that our framework not only significantly outperforms state-of-the-art methods, but achieves better performance in predicting triplets with multi-token entities and extracting triplets in sentences contain multi-triplets.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [9.195700645446777, 6.701645851135254], "id": 937, "title": "On the Calibration of Massively Multilingual Language Models", "authors": "Kabir Ahuja, Sunayana Sitaram, Sandipan Dandapat and Monojit Choudhury", "abstract": "Massively Multilingual Language Models (MMLMs) have recently gained popularity due to their surprising effectiveness in cross-lingual transfer. While there has been much work in evaluating these models for their performance on a variety of tasks and languages, little attention has been paid on how well calibrated these models are with respect to the confidence in their predictions. We first investigate the calibration of MMLMs in the zero-shot setting and observe a clear case of miscalibration in low-resource languages or those which are typologically diverse from English. Next, we empirically show that calibration methods like temperature scaling and label smoothing do reasonably well in improving calibration in the zero-shot scenario. We also find that few-shot examples in the language can further help reduce calibration errors, often substantially. Overall, our work contributes towards building more reliable multilingual models by highlighting the issue of their miscalibration, understanding what language and model-specific factors influence it, and pointing out the strategies to improve the same.", "track": "Multilinguality", "label": 13}, {"loc": [2.83474063873291, 4.647697448730469], "id": 938, "title": "Momentum Contrastive Pre-training for Question Answering", "authors": "Minda Hu, Muzhi Li, Yasheng Wang and Irwin King", "abstract": "Existing pre-training methods for extractive Question Answering (QA) generate cloze-like queries different from natural questions in syntax structure, which could overfit pre-trained models to simple keyword matching. In order to address this problem, we propose a novel Momentum Contrastive pRe-training fOr queStion anSwering (MCROSS) method for extractive QA. Specifically, MCROSS introduces a momentum contrastive learning framework to align the answer probability between cloze-like and natural query-passage sample pairs. Hence, the pre-trained models can better transfer the knowledge learned in cloze-like samples to answering natural questions. Experimental results on three benchmarking QA datasets show that our method achieves noticeable improvement compared with all baselines in both supervised and zero-shot scenarios.", "track": "Question Answering", "label": 11}, {"loc": [8.381734848022461, 6.529812335968018], "id": 946, "title": "A Second Wave of UD Hebrew Treebanking and Cross-Domain Parsing", "authors": "Amir Zeldes, Nick Howell, Noam Ordan and Yifat Ben Moshe", "abstract": "Foundational Hebrew NLP tasks such as segmentation, tagging and parsing, have relied to date on various versions of the Hebrew Treebank (HTB, Sima'an et al. 2001). However, the data in HTB, a single-source newswire corpus, is now over 30 years old, and does not cover many aspects of contemporary Hebrew on the web. This paper presents a new, freely available UD treebank of Hebrew stratified from a range of topics selected from Hebrew Wikipedia. In addition to introducing the corpus and evaluating the quality of its annotations, we deploy automatic validation tools based on grew (Guillaume, 2021), and conduct the first cross domain parsing experiments in Hebrew. We obtain new state-of-the-art (SOTA) results on UD NLP tasks, using a combination of the latest language modelling and some incremental improvements to existing transformer based approaches. We also release a new version of the UD HTB matching annotation scheme updates from our new corpus.", "track": "Resources and Evaluation", "label": 1}, {"loc": [5.812136173248291, 5.32790994644165], "id": 960, "title": "Finding Dataset Shortcuts with Grammar Induction", "authors": "Dan Friedman, Alexander Wettig and Danqi Chen", "abstract": "Many NLP datasets have been found to contain shortcuts: simple decision rules that achieve surprisingly high accuracy. However, it is difficult to discover shortcuts automatically. Prior work on automatic shortcut detection has focused on enumerating features like unigrams or bigrams, which can find only low-level shortcuts, or relied on post-hoc model interpretability methods like saliency maps, which reveal qualitative patterns without a clear statistical interpretation. In this work, we propose to use probabilistic grammars to characterize and discover shortcuts in NLP datasets. Specifically, we use a context-free grammar to model patterns in sentence classification datasets and use a synchronous context-free grammar to model datasets involving sentence pairs. The resulting grammars reveal interesting shortcut features in a number of datasets, including both simple and high-level features, and automatically identify groups of test examples on which conventional classifiers fail. Finally, we show that the features we discover can be used to generate diagnostic contrast examples and incorporated into standard robust optimization methods to improve worst-group accuracy.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [4.809964656829834, 3.2975311279296875], "id": 970, "title": "Retrieval Augmentation for Commonsense Reasoning: A Unified Approach", "authors": "Wenhao Yu, Chenguang Zhu, Zhihan Zhang, Shuohang Wang, Zhuosheng Zhang, Yuwei Fang and Meng Jiang", "abstract": "A common thread of retrieval-augmented methods in the existing literature focuses on retrieving encyclopedic knowledge, such as Wikipedia, which facilitates well-defined entity and relation spaces that can be modeled. However, applying such methods to commonsense reasoning tasks faces two unique challenges, i.e., the lack of a general large-scale corpus for retrieval and a corresponding effective commonsense retriever. In this paper, we systematically investigate how to leverage commonsense knowledge retrieval to improve commonsense reasoning tasks. We proposed a unified framework of retrieval-augmented commonsense reasoning (called RACo), including a newly constructed commonsense corpus with over 20 million documents and novel strategies for training a commonsense retriever. We conducted experiments on four different commonsense reasoning tasks. Extensive evaluation results showed that our proposed RACo can significantly outperform other knowledge-enhanced method counterparts, achieving new SoTA performance on the CommonGen and CREAK leaderboards.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [1.473496913909912, 7.988746643066406], "id": 979, "title": "Open World Classification with Adaptive Negative Samples", "authors": "Ke Bai, Guoyin Wang, Jiwei Li, Sunghyun Park, Sungjin Lee, Puyang Xu, Ricardo Henao and Lawrence Carin", "abstract": "Open world classification is a task in natural language processing with key practical relevance and impact.\nSince the open or unknown category data only manifests in the inference phase, finding a model with a suitable decision boundary accommodating for the identification of known classes and discrimination of the open category is challenging.\nThe performance of existing models is limited by the lack of effective open category data during the training stage or the lack of a good mechanism to learn appropriate decision boundaries.\nWe propose an approach based on Adaptive Negative Samples (ANS) designed to generate effective synthetic open category samples in the training stage and without requiring any prior knowledge or external datasets.\nEmpirically, we find a significant advantage in using auxiliary one-versus-rest binary classifiers, which \neffectively utilize the generated negative samples and avoid the complex threshold-seeking stage in previous works.\nExtensive experiments on three benchmark datasets show that ANS achieves significant improvements over state-of-the-art methods.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [4.61194372177124, 9.15684700012207], "id": 986, "title": "Re3: Generating Longer Stories With Recursive Reprompting and Revision", "authors": "Kevin Yang, Yuandong Tian, Nanyun Peng and Dan Klein", "abstract": "We consider the problem of automatically generating longer stories of over two thousand words. Compared to prior work on shorter stories, long-range plot coherence and relevance are more central challenges here. We propose the Recursive Reprompting and Revision framework (Re3) to address these challenges by (a) prompting a general-purpose language model to construct a structured overarching plan, and (b) generating story passages by repeatedly injecting contextual information from both the plan and current story state into a language model prompt. We then revise by (c) reranking different continuations for plot coherence and premise relevance, and finally (d) editing the best continuation for factual consistency. Compared to similar-length stories generated directly from the same base model, human evaluators judged substantially more of Re3's stories as having a coherent overarching plot (by 14% absolute increase), and relevant to the given initial premise (by 20%).", "track": "Natural Language Generation", "label": 6}, {"loc": [10.236653327941895, 7.6986165046691895], "id": 993, "title": "Does Joint Training Really Help Cascaded Speech Translation?", "authors": "Viet Anh Khoa Tran, David Thulke, Yingbo Gao, Christian Herold and Hermann Ney", "abstract": "Currently, in speech translation, the straightforward approach - cascading a recognition system with a translation system - delivers state-of-the-art results.\nHowever, fundamental challenges such as error propagation from the automatic speech recognition system still remain.\nTo mitigate these problems, recently, people turn their attention to direct data and propose various joint training methods.\nIn this work, we seek to answer the question of whether joint training really helps cascaded speech translation.\nWe review recent papers on the topic and also investigate a joint training criterion by marginalizing the transcription posterior probabilities.\nOur findings show that a strong cascaded baseline can diminish any improvements obtained using joint training, and we suggest alternatives to joint training.\nWe hope this work can serve as a refresher of the current speech translation landscape, and motivate research in finding more efficient and creative ways to utilize the direct data for speech translation.", "track": "Multilinguality", "label": 13}, {"loc": [1.7381771802902222, 9.103643417358398], "id": 999, "title": "MasakhaNER 2.0: Africa-centric Transfer Learning for Named Entity Recognition", "authors": "David Ifeoluwa Adelani, Graham Neubig, Sebastian Ruder, Shruti Rijhwani, Michael Coenraad Beukman, Chester Palen-Michel, Constantine Lignos, Jesujoba Alabi, Shamsuddeen Hassan Muhammad, Peter Nabende, Cheikh M. Bamba Dione, Andiswa Bukula, Rooweither Mabuya, Bonaventure F. P. Dossou, Blessing Sibanda, Happy Buzaaba, Jonathan Mukiibi, Godson K. KALIPE, Derguene Mbaye, Amelia Taylor, Fatoumata Ouoba Kabore, Chris Chinenye Emezue, Anuoluwapo Aremu, Perez Ogayo, Catherine Gitau, Edwin Munkoh-Buabeng, victoire Memdjokam Koagne, Allahsera Auguste Tapo, Tebogo Macucwa, Vukosi Marivate, MBONING TCHIAZE Elvis, Tajuddeen Gwadabe, Tosin Adewumi, Orevaoghene Ahia, Joyce Nakatumba-Nabende, Neo Lerato Mokono, Ignatius Ezeani, Chiamaka Chukwuneke, Mofetoluwa Oluwaseun Adeyemi, Gilles Quentin HACHEME, Idris Abdulmumin, Odunayo Ogundepo, Oreen Yousuf, Tatiana Moteu and Dietrich Klakow", "abstract": "African languages are spoken by over a billion people, but they are under-represented in NLP research and development. Multiple challenges exist, including the limited availability of annotated training and evaluation datasets as well as the lack of understanding of which settings, languages, and recently proposed methods like cross-lingual transfer will be effective. In this paper, we aim to move towards solutions for these challenges, focusing on the task of named entity recognition (NER). We present the creation of the largest to-date human-annotated NER dataset for 20 African languages. We study the behaviour of state-of-the-art cross-lingual transfer methods in an Africa-centric setting, empirically demonstrating that the choice of source transfer language significantly affects performance. While much previous work defaults to using English as the source language, our results show that choosing the best transfer language improves zero-shot F1 scores by an average of 14\\% over 20 languages as compared to using English.", "track": "Resources and Evaluation", "label": 1}, {"loc": [7.9321393966674805, 6.760947227478027], "id": 1009, "title": "Ethics consideration sections in natural language processing papers", "authors": "Luciana Benotti and Patrick Blackburn", "abstract": "In this paper, we present the results of a manual classification of all ethical consideration sections for ACL 2021. We also compare how many papers had an ethics consideration section per track and per world region in ACL 2021. We classified papers according to the ethical issues covered (research benefits, potential harms, and vulnerable groups affected) and whether the paper was marked as requiring ethics review by at least one reviewer. Moreover, we discuss recurring obstacles we have observed (highlighting some interesting texts we found along the way) and conclude with three suggestions. We think that this paper may be useful for anyone who needs to write --- or review --- an ethics section and would like to get an overview of what others have done.", "track": "Ethics", "label": 21}, {"loc": [7.981054782867432, 9.063968658447266], "id": 1013, "title": "Continued Pretraining for Better Zero- and Few-Shot Promptability", "authors": "Zhaofeng Wu, Robert L Logan IV, Pete Walsh, Akshita Bhagia, Dirk Groeneveld, Sameer Singh and Iz Beltagy", "abstract": "Recently introduced language model prompting methods can achieve high accuracy in zero- and few-shot settings while requiring few to no learned task-specific parameters. Nevertheless, these methods still often trail behind full model finetuning. In this work, we investigate if a dedicated continued pretraining stage could improve \"promptability\", i.e., zero-shot performance with natural language prompts or few-shot performance with prompt tuning. We reveal settings where existing continued pretraining methods lack promptability. We also identify current methodological gaps, which we fill with thorough large-scale experiments. We demonstrate that a simple recipe, continued pretraining that incorporates a trainable prompt during multi-task learning, leads to improved promptability in both zero- and few-shot settings compared to existing methods, up to 31% relative. On the other hand, we find that continued pretraining using MAML-style meta-learning, a method that directly optimizes few-shot promptability, yields subpar performance. We validate our findings with two prompt tuning methods, and, based on our results, we provide concrete recommendations to optimize promptability for different use cases.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [6.456911087036133, 1.9210976362228394], "id": 1021, "title": "Less is More: Summary of Long Instructions is Better for Program Synthesis", "authors": "Kirby Kuznia, Swaroop Mishra, Mihir Parmar and Chitta Baral", "abstract": "Despite the success of large pre-trained language models (LMs) such as Codex, they show below-par performance on the larger and more complicated programming related questions. We show that LMs benefit from the summarized version of complicated questions. Our findings show that superfluous information often present in problem description such as human characters, background stories, and names (which are included to help humans in understanding a task) does not help models in understanding a task. To this extent, we create a meta-dataset from the frequently used APPS dataset and the newly created CodeContests dataset for the program synthesis task. Our meta-dataset consists of human and synthesized summaries of the long and complicated programming questions. Experimental results on Codex show that our proposed approach outperforms baseline by 8.13% on the APPS dataset and 11.88% on the CodeContests dataset on an average in terms of strict accuracy. Our analysis shows that summaries significantly improve performance for introductory (9.86%) and interview (11.48%) related programming questions. However, it shows improvement by a small margin (~ 2%) for competitive programming questions, implying the scope for future research direction.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [3.7891945838928223, 4.558401107788086], "id": 1024, "title": "Is a Question Decomposition Unit All We Need?", "authors": "Pruthvi Jayeshkumar Patel, Swaroop Mishra, Mihir Parmar and Chitta Baral", "abstract": "Large Language Models (LMs) have achieved state-of-the-art performance on many Natural Language Processing (NLP) benchmarks. With the growing number of new benchmarks, we build bigger and more complex LMs. However, building new LMs may not be an ideal option owing to the cost, time and environmental impact associated with it. We explore an alternative route: can we modify data by expressing it in terms of the model's strengths, so that a question becomes easier for models to answer? We investigate if humans can decompose a hard question into a set of simpler questions that are relatively easier for models to solve. We analyze a range of datasets involving various forms of reasoning and find that it is indeed possible to significantly improve model performance (24% for GPT3 and 29% for RoBERTa-SQuAD along with a symbolic calculator) via decomposition. Our approach provides a viable option to involve people in NLP research in a meaningful way. Our findings indicate that Human-in-the-loop Question Decomposition (HQD) can potentially provide an alternate path to building large LMs.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [5.245235919952393, 8.955718994140625], "id": 1031, "title": "Discourse-Aware Soft Prompting for Text Generation", "authors": "Marjan Ghazvininejad, Vladimir Karpukhin, Vera Gor and Asli Celikyilmaz", "abstract": "Current efficient fine-tuning methods\n(e.g., adapters, prefix-tuning, etc.) \nhave optimized conditional text generation via training a small set of extra parameters of the neural language model, while freezing the rest for efficiency. While showing strong performance on some generation tasks, they don't generalize across all generation tasks. We show that soft-prompt based conditional text generation can be improved with simple and efficient methods that simulate modeling the discourse structure of human written text.\nWe investigate two design choices: \nFirst, we apply hierarchical blocking on the prefix parameters to simulate a higher-level discourse structure of human written text. Second, we apply attention sparsity on the prefix parameters at different layers of the network and learn sparse transformations on the softmax-function. We show that structured design of prefix parameters yields more coherent, faithful and relevant generations than the baseline prefix-tuning on all generation tasks.", "track": "Natural Language Generation", "label": 6}, {"loc": [4.837189674377441, 8.69967269897461], "id": 1040, "title": "ExPUNations: Augmenting Puns with Keywords and Explanations", "authors": "Jiao Sun, Anjali Narayan-Chen, Shereen Oraby, Alessandra Cervone, Tagyoung Chung, Jing Huang, Yang Liu and Nanyun Peng", "abstract": "The tasks of humor understanding and generation are challenging and subjective even for humans, requiring commonsense and real-world knowledge to master. Puns, in particular, add the challenge of fusing that knowledge with the ability to interpret lexical-semantic ambiguity. In this paper, we present the ExPUNations (ExPUN) dataset, in which we augment an existing dataset of puns with detailed crowdsourced annotations of keywords denoting the most distinctive words that make the text funny, pun explanations describing why the text is funny, and fine-grained funniness ratings. This is the first humor dataset with such extensive and fine-grained annotations specifically for puns. Based on these annotations, we propose two tasks: explanation generation to aid with pun classification and keyword-conditioned pun generation, to challenge the current state-of-the-art natural language understanding and generation models' ability to understand and generate humor. We showcase that the annotated keywords we collect are helpful for generating better novel humorous texts in human evaluation, and that our natural language explanations can be leveraged to improve both the accuracy and robustness of humor classifiers.", "track": "Resources and Evaluation", "label": 1}, {"loc": [8.813623428344727, 6.447241306304932], "id": 1041, "title": "SLING: Sino Linguistic Evaluation of Large Language Models", "authors": "Yixiao Song, Kalpesh Krishna, Rajesh Bhatt and Mohit Iyyer", "abstract": "To understand what kinds of linguistic knowledge are encoded by pretrained Chinese language models (LMs), we introduce the benchmark of Sino LINGuistics (SLING), which consists of 38K minimal sentence pairs in Mandarin Chinese grouped into 9 high-level linguistic phenomena. Each pair demonstrates the acceptability contrast of a specific syntactic or semantic phenomenon (e.g., The keys are lost vs. The keys is lost), and an LM should assign lower perplexity to the acceptable sentence. In contrast to the CLiMP dataset (Xiang et al., 2021), which also contains Chinese minimal pairs and was created by translating the vocabulary of the English BLiMP dataset, the minimal pairs in SLING are derived primarily by applying syntactic and lexical transformations to naturally-occurring, linguist-annotated sentences from the Chinese Treebank 9.0, thus addressing severe issues in CLiMP's data generation process. We test 18 publicly available pretrained monolingual (e.g., BERT-base-zh, CPM) and multi-lingual (e.g., mT5, XLM) language models on SLING. Our experiments show that the average accuracy for LMs is far below human performance (69.7% vs. 97.1%), while BERT-base-zh achieves the highest accuracy (84.8%) of all tested LMs, even much larger ones. Additionally, we find that most LMs have a strong gender and number (singular/plural) bias, and they perform better on local phenomena than hierarchical ones.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.3472514152526855, 8.667264938354492], "id": 1043, "title": "Context-Situated Pun Generation", "authors": "Jiao Sun, Anjali Narayan-Chen, Shereen Oraby, Shuyang Gao, Tagyoung Chung, Jing Huang, Yang Liu and Nanyun Peng", "abstract": "Previous work on pun generation commonly begins with a given pun word (a pair of homophones for heterographic pun generation and a polyseme for homographic pun generation) and seeks to generate an appropriate pun. While this may enable efficient pun generation, we believe that a pun is most entertaining if it fits appropriately within a given context, e.g., a given situation or dialogue. In this work, we propose a new task, context-situated pun generation, where a specific context represented by a set of keywords is provided, and the task is to first identify suitable pun words that are appropriate for the context, then generate puns based on the context keywords and the identified pun words. \nWe collect a new dataset, CUP (Context-sitUated Pun), containing 4.5k tuples of context words and pun pairs. Based on the new data and setup, we propose a pipeline system for context-situated pun generation, including a pun word retrieval module that identifies suitable pun words for a given context, and a pun generation module that generates puns from context keywords and pun words. \nHuman evaluation shows that 69% of our top retrieved pun words can be used to generate context-situated puns, and our generation module yields successful puns 31% of the time given a plausible tuple of context words and pun pair, almost tripling the yield of a state-of-the-art pun generation model. With an end-to-end evaluation, our pipeline system with the top-1 retrieved pun pair for a given context can generate successful puns 40% of the time, better than all other modeling variations but 32% lower than the human success rate. This highlights the difficulty of the task, and encourages more research in this direction.", "track": "Natural Language Generation", "label": 6}, {"loc": [2.616332530975342, 7.13837194442749], "id": 1044, "title": "Retrieval-Augmented Generative Question Answering for Event Argument Extraction", "authors": "Xinya Du and Heng Ji", "abstract": "Event argument extraction has long been studied as a sequential prediction problem with extractive-based methods, tackling each argument in isolation. Although recent work proposes generation-based methods to capture cross-argument dependency, they require generating and post-processing a complicated target sequence (template). Motivated by these observations and recent pretrained language models' capabilities of learning from demonstrations. We propose a retrieval-augmented generative QA model (R-GQA) for event argument extraction. It retrieves the most similar QA pair and augments it as prompt to the current example's context, then decodes the arguments as answers. Our approach outperforms substantially prior methods across various settings (i.e. fully supervised, domain transfer, and fewshot learning). Finally, we propose a clustering-based sampling strategy (JointEnc) and conduct a thorough analysis of how different strategies influence the few-shot learning performances.", "track": "Information Extraction", "label": 5}, {"loc": [5.041406631469727, 12.435675621032715], "id": 1052, "title": "Concadia: Towards Image-Based Text Generation with a Purpose", "authors": "Elisa Kreiss, Fei Fang, Noah Goodman and Christopher Potts", "abstract": "Current deep learning models often achieve excellent results on benchmark image-to-text datasets but fail to generate texts that are useful in practice. We argue that to close this gap, it is vital to distinguish descriptions from captions based on their distinct communicative roles. Descriptions focus on visual features and are meant to replace an image (often to increase accessibility), whereas captions appear alongside an image to supply additional information. To motivate this distinction and help people put it into practice, we introduce the publicly available Wikipedia-based dataset Concadia consisting of 96,918 images with corresponding English-language descriptions, captions, and surrounding context. Using insights from Concadia, models trained on it, and a preregistered human-subjects experiment with human- and model-generated texts, we characterize the commonalities and differences between descriptions and captions. In addition, we show that, for generating both descriptions and captions, it is useful to augment image-to-text models with representations of the textual context in which the image appeared.", "track": "Discourse and Pragmatics", "label": 24}, {"loc": [5.185880184173584, 12.415975570678711], "id": 1055, "title": "Context Matters for Image Descriptions for Accessibility: Challenges for Referenceless Evaluation Metrics", "authors": "Elisa Kreiss, Cynthia Bennett, Shayan Hooshmand, Eric Zelikman, Meredith Ringel Morris and Christopher Potts", "abstract": "Few images on the Web receive alt-text descriptions that would make them accessible to blind and low vision (BLV) users. Image-based NLG systems have progressed to the point where they can begin to address this persistent societal problem, but these systems will not be fully successful unless we evaluate them on metrics that guide their development correctly. Here, we argue against current referenceless metrics -- those that don't rely on human-generated ground-truth descriptions -- on the grounds that they do not align with the needs of BLV users. The fundamental shortcoming of these metrics is that they do not take context into account, whereas contextual information is highly valued by BLV users. To substantiate these claims, we present a study with BLV participants who rated descriptions along a variety of dimensions. An in-depth analysis reveals that the lack of context-awareness makes current referenceless metrics inadequate for advancing image accessibility. As a proof-of-concept, we provide a contextual version of the referenceless metric CLIPScore which begins to address the disconnect to the BLV data.", "track": "Resources and Evaluation", "label": 1}, {"loc": [4.461729526519775, 4.284296989440918], "id": 1058, "title": "MetaLogic: Logical Reasoning Explanations with Fine-Grained Structure", "authors": "Yinya Huang, Hongming Zhang, Ruixin Hong, Xiaodan Liang, Changshui Zhang and Dong Yu", "abstract": "In this paper, we propose a comprehensive benchmark to investigate models' logical reasoning capabilities in complex real-life scenarios. Current explanation datasets often employ synthetic data with simple reasoning structures. Therefore, it cannot express more complex reasoning processes, such as the rebuttal to a reasoning step and the degree of certainty of the evidence. To this end, we propose a comprehensive logical reasoning explanation form. Based on the multi-hop chain of reasoning, the explanation form includes three main components: (1) The condition of rebuttal that the reasoning node can be challenged; (2) Logical formulae that uncover the internal texture of reasoning nodes; (3) Reasoning strength indicated by degrees of certainty. The fine-grained structure conforms to the real logical reasoning scenario, better fitting the human cognitive process but, simultaneously, is more challenging for the current models. We evaluate the current best models' performance on this new explanation form. The experimental results show that generating reasoning graphs remains a challenging task for current models, even with the help of giant pre-trained language models.", "track": "Resources and Evaluation", "label": 1}, {"loc": [2.296556234359741, 4.484858989715576], "id": 1061, "title": "Explicit Query Rewriting for Conversational Dense Retrieval", "authors": "Hongjin Qian and Zhicheng Dou", "abstract": "In a conversational search scenario, a query might be context-dependent because some words are referred to previous expressions or omitted. Previous works tackle the issue by either reformulating the query into a self-contained query (query rewriting) or learning a contextualized query embedding from the query context (context modelling). In this paper, we propose a model CRDR that can perform query rewriting and context modelling in a unified framework in which the query rewriting's supervision signals further enhance the context modelling. Instead of generating a new query, CRDR only performs necessary modifications on the original query, which improves both accuracy and efficiency of query rewriting. In the meantime, the query rewriting benefits the context modelling by explicitly highlighting relevant terms in the query context, which improves the quality of the learned contextualized query embedding. To verify the effectiveness of CRDR, we perform comprehensive experiments on TREC CAsT-19 and TREC CAsT-20 datasets, and the results show that our method outperforms all baseline models in terms of both quality of query rewriting and quality of context-aware ranking.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [8.020330429077148, 5.715707778930664], "id": 1064, "title": "Efficient Nearest Neighbor Emotion Classification with BERT-whitening", "authors": "Wenbiao Yin and Lin Shang", "abstract": "Retrieval-based methods have been proven effective in many NLP tasks. Previous methods use representations from the pre-trained model for similarity search directly. However, the sentence representations from the pre-trained model like BERT perform poorly in retrieving semantically similar sentences, resulting in poor performance of the retrieval-based methods. In this paper, we propose kNN-EC, a simple and efficient non-parametric emotion classification (EC) method using nearest neighbor retrieval. We use BERT-whitening to get better sentence semantics, ensuring that nearest neighbor retrieval works. Meanwhile, BERT-whitening can also reduce memory storage of datastore and accelerate retrieval speed, solving the efficiency problem of the previous methods. kNN-EC average improves the pre-trained model by 1.17 F1-macro on two emotion classification datasets.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [7.309751033782959, 9.64592170715332], "id": 1066, "title": "FastClass: A Time-Efficient Approach to Weakly-Supervised Text Classification", "authors": "Tingyu Xia, Yue Wang, Yuan Tian and Yi Chang", "abstract": "Weakly-supervised text classification aims to train a classifier using only class descriptions and unlabeled data. Recent research shows that keyword-driven methods can achieve state-of-the-art performance on various tasks. However, these methods not only rely on carefully-crafted class descriptions to obtain class-specific keywords but also require substantial amount of unlabeled data and takes a long time to train. This paper proposes FastClass, an efficient weakly-supervised classification approach. It uses dense text representation to retrieve class-relevant documents from external unlabeled corpus and selects an optimal subset to train a classifier. Compared to keyword-driven methods, our approach is less reliant on initial class descriptions as it no longer needs to expand each class description into a set of class-specific keywords.Experiments on a wide range of classification tasks show that the proposed approach frequently outperforms keyword-driven models in terms of classification accuracy and often enjoys orders-of-magnitude faster training speed.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [7.750030517578125, 7.052759170532227], "id": 1075, "title": "Neural-Symbolic Inference for Robust Autoregressive Graph Parsing via Compositional Uncertainty Quantification", "authors": "Zi Lin, Jeremiah Liu and Jingbo Shang", "abstract": "Pre-trained seq2seq models excel at graph semantic parsing with rich annotated data, but generalize worse to out-of-distribution (OOD) and long-tail examples. In comparison, symbolic parsers under-perform on population-level metrics, but exhibit unique strength in OOD and tail generalization. In this work, we study compositionality-aware approach to neural-symbolic inference informed by model confidence, performing fine-grained neural-symbolic reasoning at subgraph level (i.e., nodes and edges) and precisely targeting subgraph components with high uncertainty in the neural parser. As a result, the method combines the distinct strength of the neural and symbolic approaches in capturing different aspects of the graph prediction, leading to well-rounded generalization performance both across domains and in the tail. We empirically investigate the approach in the English Resource Grammar (ERG) parsing problem on a diverse suite of standard in-domain and seven OOD corpora. Our approach leads to 35.26% and 35.60% error reduction in aggregated SMATCH score over neural and symbolic approaches respectively, and 14% absolute accuracy gain in key tail linguistic categories over the neural model, outperforming prior state-of-art methods that do not account for compositionality or uncertainty.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [2.8263230323791504, 8.70888900756836], "id": 1080, "title": "A Speaker-Aware Co-Attention Framework for Medical Dialogue Information Extraction", "authors": "Yuan Xia, Zhenhui Shi, Jingbo Zhou, Jiayu Xu, Chao Lu, Yehui Yang, Lei Wang, Haifeng Huang, Xia Zhang and Junwei Liu", "abstract": "With the development of medical digitization, the extraction and structuring of Electronic Medical Records (EMRs) have become challenging but fundamental tasks. How to accurately and automatically extract structured information from medical dialogues is especially difficult because the information needs to be inferred from complex interactions between the doctor and the patient. To this end, in this paper, we propose a speaker-aware co-attention framework for medical dialogue information extraction. To better utilize the pre-trained language representation model to perceive the semantics of the utterance and the candidate item, we develop a speaker-aware dialogue encoder with multi-task learning, which considers the speaker's identity into account. To deal with complex interactions between different utterances and the correlations between utterances and candidate items, we propose a co-attention fusion network to aggregate the utterance information. We evaluate our framework on the public medical dialogue extraction datasets to demonstrate the superiority of our method, which can outperform the state-of-the-art methods by a large margin. Codes will be publicly available upon acceptance.", "track": "NLP Applications", "label": 0}, {"loc": [4.889120101928711, 4.7337141036987305], "id": 1085, "title": "Towards Interactivity and Interpretability: A Rationale-based Legal Judgment Prediction Framework", "authors": "Yiquan Wu, Yifei Liu, Weiming Lu, Yating Zhang, Jun Feng, Changlong Sun, Fei Wu and Kun Kuang", "abstract": "Legal judgment prediction (LJP) is a fundamental task in legal AI, which aims to assist the judge to hear the case and determine the judgment. The legal judgment usually consists of the law article, charge, and term of penalty. In the real trial scenario, the judge usually makes the decision step-by-step: first concludes the rationale according to the case's facts and then determines the judgment. Recently, many models have been proposed and made tremendous progress in LJP, but most of them adopt an end-to-end manner that cannot be manually intervened by the judge for practical use. Moreover, existing models lack interpretability due to the neglect of rationale in the prediction process. Following the judge's real trial logic, in this paper, we propose a novel Rationale-based Legal Judgment Prediction (RLJP) framework. In the RLJP framework, the LJP process is split into two steps. In the first phase, the model generates the rationales according to the fact description. Then it predicts the judgment based on the fact and the generated rationales. Extensive experiments on a real-world dataset show RLJP achieves the best results compared to the state-of-the-art models. Meanwhile, the proposed framework provides good interactivity and interpretability which enables practical use.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.44679594039917, 12.414607048034668], "id": 1098, "title": "RelCLIP: Adapting Language-Image Pretraining for Visual Relationship Detection via Relational Contrastive Learning", "authors": "Yi Zhu, Zhaoqing Zhu, Bingqian Lin, Xiaodan Liang, Feng Zhao and Jianzhuang Liu", "abstract": "Conventional visual relationship detection models only use the numeric ids of relation labels for training, but ignore the semantic correlation between the labels, which leads to severe training biases and harms the generalization ability of representations. In this paper, we introduce compact language information of relation labels for regularizing the representation learning of visual relations. Specifically, we propose a simple yet effective visual Relationship prediction framework that transfers natural language knowledge learned from Contrastive Language-Image Pre-training (CLIP) models to enhance the relationship prediction, termed RelCLIP. Benefiting from the powerful visual-semantic alignment ability of CLIP at image level, we introduce a novel Relational Contrastive Learning (RCL) approach which explores relation-level visual-semantic alignment via learning to match cross-modal relational embeddings. By collaboratively learning the semantic coherence and discrepancy from relation triplets, the model can generate more discriminative and robust representations. Experimental results on the Visual Genome dataset show that RelCLIP achieves significant improvements over strong baselines under full (provide accurate labels) and distant supervision (provide noise labels), demonstrating its powerful generalization ability in learning relationship representations. Code will be available at https://gitee.com/mindspore/models/tree/master/research/cv/RelCLIP.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [10.596184730529785, 7.618668079376221], "id": 1105, "title": "Candidate Soups: Fusing Candidate Results Improves Translation Quality for Non-Autoregressive Translation", "authors": "Huanran Zheng, Wei Zhu, Pengfei Wang and Xiaoling Wang", "abstract": "Non-autoregressive translation (NAT) model achieves a much faster inference speed than the autoregressive translation (AT) model because it can simultaneously predict all tokens during inference. However, its translation quality suffers from degradation compared to AT. And existing NAT methods only focus on improving the NAT model's performance but do not fully utilize it. In this paper, we propose a simple but effective method called \"Candidate Soups,\u201d which can obtain high-quality translations while maintaining the inference speed of NAT models. Unlike previous approaches that pick the individual result and discard the remainders, Candidate Soups (CDS) can fully use the valuable information in the different candidate translations through model uncertainty. Extensive experiments on two benchmarks (WMT'14 EN\u2013DE and WMT'16 EN\u2013RO) demonstrate the effectiveness and generality of our proposed method, which can significantly improve the translation quality of various base models. More notably, our best variant outperforms the AT model on three translation tasks with 7.6\u00d7 speedup.", "track": "Machine Translation", "label": 10}, {"loc": [8.58010482788086, 8.370965957641602], "id": 1118, "title": "Evaluating Parameter Efficient Learning for Generation", "authors": "Peng Xu, Mostofa Patwary, Shrimai Prabhumoye, Virginia Adams, Ryan J. Prenger, Wei Ping, Nayeon Lee, Mohammad Shoeybi and Bryan Catanzaro", "abstract": "Parameter efficient learning methods (PERMs)\nhave recently gained significant attention as\nthey provide an efficient way for pre-trained\nlanguage models (PLMs) to adapt to a downstream task. However, these conclusions are\nmostly drawn from in-domain evaluations over\nthe full training set. In this paper, we present\ncomparisons between PERMs and finetuning\nfrom three new perspectives: (1) the effect of\nsample and model size to in-domain evaluations, (2) generalization to unseen domains and\nnew datasets, and (3) the faithfulness of generations. Our results show that for in-domain\nsettings (a) there is a cross point of sample\nsize for which PERMs will perform better than\nfinetuning when training with fewer samples,\nand (b) larger PLMs have larger cross points.\nFor cross-domain and cross-dataset cases, we\nshow that (a) Adapter (Houlsby et al., 2019)\nperforms the best amongst all the PERMs studied here, and (b) it outperforms finetuning if\nthe task dataset is below a certain size. We\nalso compare the faithfulness of generations\nand show that PERMs can achieve better faithfulness score than finetuning, especially for\nsmall training set, by as much as 6%. Finally,\nwe apply Adapter to MT-NLG 530b (Smith\net al., 2022) and achieve new state-of-the-art\nresults on Xsum (Narayan et al., 2018) for all\nROUGE scores (ROUGE-1 49.17, ROUGE-2\n27.20, ROUGE-L 40.98).", "track": "Natural Language Generation", "label": 6}, {"loc": [5.636509418487549, 12.1234769821167], "id": 1119, "title": "McQueen: a Benchmark for Multimodal Conversational Query Rewrite", "authors": "Yifei Yuan, Chen Shi, Runze Wang, Liyi Chen, Feijun Jiang, Yuan You and Wai Lam", "abstract": "The task of query rewrite aims to convert an in-context query to its fully-specified version where ellipsis and coreference are completed and referred-back according to the history context. Although much progress has been made, less efforts have been paid to real scenario conversations that involve drawing information from more than one modalities. In this paper, we propose the task of multimodal conversational query rewrite (McQR), which performs query rewrite under the multimodal visual conversation setting. We collect a large-scale dataset named McQueen based on manual annotation, which contains 15k visual conversations and over 80k queries where each one is associated with a fully-specified rewrite version. In addition, for entities appearing in the rewrite, we provide the corresponding image box annotation. We then use the McQueen dataset to benchmark a state-of-the-art method for effectively tackling the McQR task, which is based on a multimodal pre-trained model with pointer generator. Extensive experiments are performed to demonstrate the effectiveness of our model on this task.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.541003704071045, 8.941580772399902], "id": 1126, "title": "Self-supervised Graph Masking Pre-training for Graph-to-Text Generation", "authors": "Jiuzhou Han and Ehsan Shareghi", "abstract": "Large-scale pre-trained language models (PLMs) have advanced Graph-to-Text (G2T) generation by processing the linearised version of a graph. However, the linearisation is known to ignore the structural information. Additionally, PLMs are typically pre-trained on free text which introduces domain mismatch between pre-training and downstream G2T generation tasks. To address these shortcomings, we propose graph masking pre-training strategies that neither require supervision signals nor adjust the architecture of the underlying pre-trained encoder-decoder model. When used with a pre-trained T5, our approach achieves new state-of-the-art results on WebNLG+2020 and EventNarrative G2T generation datasets. Our method also shows to be very effective in the low-resource setting.", "track": "Natural Language Generation", "label": 6}, {"loc": [8.798027992248535, 8.371423721313477], "id": 1127, "title": "Improving Stability of Fine-Tuning Pretrained Language Models via Component-Wise Gradient Norm Clipping", "authors": "Chenghao Yang and Xuezhe Ma", "abstract": "Fine-tuning over large pretrained language models (PLMs) has established many state-of-the-art results. \nDespite its superior performance, such fine-tuning can be unstable, resulting in significant variance in performance and potential risks for practical applications. Previous works have attributed such instability to the catastrophic forgetting problem in the top layers of PLMs, which indicates iteratively fine-tuning layers in a top-down manner is a promising solution. In this paper, we first point out that this method does not always work out due to the different convergence speeds of different layers/modules. Inspired by this observation, we propose a simple component-wise gradient norm clipping method to adjust the convergence speed for different components. Experiment results demonstrate that our method achieves consistent improvements in terms of generalization performance, convergence speed, and training stability. The codebase can be found at https://github.com/yangalan123/FineTuningStability.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [8.11702823638916, 3.013378620147705], "id": 1147, "title": "Differentially Private Language Models for Secure Data Sharing", "authors": "Justus Mattern, Zhijing Jin, Benjamin Weggenmann, Mrinmaya Sachan and Bernhard Schoelkopf", "abstract": "To protect the privacy of individuals whose data is being shared, it is of high importance to develop methods allowing researchers and companies to release textual data while providing formal privacy guarantees to its originators. In the field of NLP, substantial efforts have been directed at building mechanisms following the framework of local differential privacy, thereby anonymizing individual text samples before releasing them. In practice, these approaches are often dissatisfying in terms of the quality of their output language due to the strong noise required for local differential privacy. In this paper, we approach the problem at hand using global differential privacy, particularly by training a generative language model in a differentially private manner and consequently sampling data from it. Using natural language prompts and a new prompt-mismatch loss, we are able to create highly accurate and fluent textual datasets taking on specific desired attributes such as sentiment or topic and resembling statistical properties of the training data. We perform thorough experiments indicating that our synthetic datasets do not leak information from our original data and are of high language quality and highly suitable for training models for further analysis on real-world data. Notably, we also demonstrate that training classifiers on private synthetic data outperforms directly training classifiers with DP-SGD.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.886735439300537, 8.729007720947266], "id": 1150, "title": "Conditional set generation using Seq2seq models", "authors": "Aman Madaan, Dheeraj Rajagopal, Niket Tandon, Yiming Yang and Antoine Bosselut", "abstract": "Conditional set generation learns a mapping from an input sequence of tokens to a set. Several NLP tasks, such as entity typing and dialogue emotion tagging, are instances of set generation. Seq2Seq models are a popular choice to model set generation but they treat a set as a sequence and do not fully leverage its key properties, namely order-invariance and cardinality. We propose a novel algorithm for effectively sampling informative orders over the combinatorial space of label orders. Further, we jointly model the set cardinality and output by listing the set size as the first element and taking advantage of the autoregressive factorization used by Seq2Seq models. Our method is a model-independent data augmentation approach that endows any Seq2Seq model with the signals of order-invariance and cardinality. Training a Seq2Seq model on this new augmented data~(without any additional annotations), gets an average relative improvement of 20% for four benchmarks datasets across models spanning from BART-base, T5-11B, and GPT-3. We will release all code and data upon acceptance.", "track": "Natural Language Generation", "label": 6}, {"loc": [3.6834402084350586, 8.141483306884766], "id": 1158, "title": "Analyzing and Evaluating Faithfulness in Dialogue Summarization", "authors": "Bin Wang, Chen Zhang, Yan Zhang, Yiming Chen and Haizhou Li", "abstract": "Dialogue summarization is abstractive in nature, making it suffer from factual errors. The factual correctness of summaries has the highest priority before practical applications. Many efforts have been made to improve faithfulness in text summarization. However, there is a lack of systematic study on dialogue summarization systems. In this work, we first perform the fine-grained human analysis on the faithfulness of dialogue summaries and observe that over 35% of generated summaries are faithfully inconsistent respective the source dialogues. Furthermore, we present a new model-level faithfulness evaluation method. It examines generation models with multi-choice questions created by rule-based transformations. Experimental results show that our evaluation schema is a strong proxy for the factual correctness of summarization models. The human-annotated faithfulness samples and the evaluation toolkit are released to facilitate future research toward faithful dialogue summarization.", "track": "Summarization", "label": 14}, {"loc": [5.488138675689697, 8.956270217895508], "id": 1169, "title": "Twist Decoding: Diverse Generators Guide Each Other", "authors": "Jungo Kasai, Keisuke Sakaguchi, Ronan Le Bras, Hao Peng, Ximing Lu, Dragomir Radev, Yejin Choi and Noah A. Smith", "abstract": "Many language generation models are now available for a wide range of generation tasks, including machine translation and summarization. Combining such diverse models may lead to further progress, but ensembling generation models is challenging during inference: conventional ensembling methods (e.g., shallow fusion) require that the models share vocabulary/tokenization schemes. We introduce Twist decoding, a simple and general text generation algorithm that benefits from diverse models at inference time. Our method does not assume the vocabulary, tokenization or even generation order is shared. Our extensive evaluations on machine translation and scientific paper summarization demonstrate that Twist decoding substantially outperforms each model decoded in isolation over various scenarios, including cases where domain-specific and general-purpose models are both available. Twist decoding also consistently outperforms the popular reranking heuristic where output candidates from one model are rescored by another. We hope that our work will encourage researchers and practitioners to examine generation models collectively, not just independently, and to seek out models with complementary strengths to the currently available models.", "track": "Natural Language Generation", "label": 6}, {"loc": [6.5302252769470215, 1.8606117963790894], "id": 1173, "title": "Exploring Representation-level Augmentation for Code Search", "authors": "Haochen Li, Chunyan Miao, Cyril Leung, Yanxian Huang, Yuan Huang, Hongyu Zhang and Yanlin Wang", "abstract": "Code search, which aims at retrieving the most relevant code fragment for a given natural language query, is a common activity in software development practice. Recently, contrastive learning is widely used in code search research, where many data augmentation approaches for source code (e.g., semantic-preserving program transformation) are proposed to learn better representations. However, these augmentations are at the raw-data level, which requires additional code analysis in the preprocessing stage and additional training cost in the training stage. In this paper, we explore augmentation methods that augment data (both code and query) at representation level which does not require additional data processing and training, and based on this we propose a general format of representation-level augmentation that unifies existing methods. Then, we propose three new augmentation methods (linear extrapolation, binary interpolation, and Gaussian scaling) based on the general format. Furthermore, we theoretically analyze the advantages of the proposed augmentation methods over traditional contrastive learning methods on code search. We experimentally evaluate the proposed representation-level augmentation methods with state-of-the-art code search models on a large-scale public dataset consisting of six programming languages. The experimental results show that our approach can consistently boost the performance of the studied code search models.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [7.992883682250977, 5.770003795623779], "id": 1187, "title": "Learning Semantic Textual Similarity via Topic-informed Discrete Latent Variables", "authors": "Erxin Yu, Lan Du, YUAN JIN, Zhepei Wei and Yi Chang", "abstract": "Recently, discrete latent variable models have received a surge of interest in both Natural Language Processing (NLP) and Computer Vision (CV), attributed to their comparable performance to the continuous counterparts in representation learning, while being more interpretable in their predictions. In this paper, we develop a topic-informed discrete latent variable model for semantic textual similarity, which learns a shared latent space for sentence-pair representation via vector quantization. Compared with previous models limited to local semantic contexts, our model can explore richer semantic information via topic modeling. We further boost the performance of semantic similarity by injecting the quantized representation into a transformer-based language model with a well-designed semantic-driven attention mechanism. We demonstrate, through extensive experiments across various English language datasets, that our model is able to surpass several strong neural baselines in semantic textual similarity tasks.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [3.7054240703582764, 7.962756156921387], "id": 1201, "title": "STRUDEL: Structured Dialogue Summarization for Dialogue Comprehension", "authors": "Borui Wang, Chengcheng Feng, Arjun Nair, Madelyn Mao, Jai Desai, Asli Celikyilmaz, Haoran Li, Yashar Mehdad and Dragomir Radev", "abstract": "Abstractive dialogue summarization has long been viewed as an important standalone task in natural language processing, but no previous work has explored the possibility of whether abstractive dialogue summarization can also be used as a means to boost an NLP system's performance on other important dialogue comprehension tasks. In this paper, we propose a novel type of dialogue summarization task - STRUctured DiaLoguE Summarization (STRUDEL) - that can help pre-trained language models to better understand dialogues and improve their performance on important dialogue comprehension tasks. In contrast to the holistic approach taken by the traditional free-form abstractive summarization task for dialogues, STRUDEL aims to decompose and imitate the hierarchical, systematic and structured mental process that we human beings usually go through when understanding and analyzing dialogues, and thus has the advantage of being more focused, specific and instructive for dialogue comprehension models to learn from. We further introduce a new STRUDEL dialogue comprehension modeling framework that integrates STRUDEL into a dialogue reasoning module over transformer encoder language models to improve their dialogue comprehension ability. In our empirical experiments on two important downstream dialogue comprehension tasks - dialogue question answering and dialogue response prediction - we demonstrate that our STRUDEL dialogue comprehension models can significantly improve the dialogue comprehension performance of transformer encoder language models.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [10.85542106628418, 6.84752082824707], "id": 1206, "title": "Competency-Aware Neural Machine Translation: Can Machine Translation Know its Own Translation Quality?", "authors": "Pei Zhang, Baosong Yang, Hao-Ran Wei, Dayiheng Liu, Kai Fan, Luo Si and Jun Xie", "abstract": "Neural machine translation (NMT) is often criticized for failures that happen\nwithout awareness. \nThe lack of competency awareness makes NMT untrustworthy. \nThis is in sharp contrast to human translators who give feedback or conduct further investigations whenever they are in doubt about predictions. \nTo fill this gap, we propose a novel competency-aware NMT by extending conventional NMT with a self-estimator, offering abilities to translate a source sentence and estimate its competency.\nThe self-estimator encodes the information of the decoding procedure and then examines whether it can reconstruct the original semantics of the source sentence. \nExperimental results on four translation tasks demonstrate that the proposed method not only carries out translation tasks intact but also delivers outstanding performance on quality estimation.\nWithout depending on any reference or annotated data typically required by state-of-the-art metric and quality estimation methods, our model yields an even higher correlation with human quality judgments than a variety of aforementioned methods, such as BLEURT, COMET, and BERTScore. \nQuantitative and qualitative analyses show better robustness of competency awareness in our model.\\footnote{Code and test sets are available at: https://github.com/xiaoyi0814/CANMT.}", "track": "Machine Translation", "label": 10}, {"loc": [4.4170427322387695, 4.983255386352539], "id": 1208, "title": "PASTA: Table-Operations Aware Fact Verification via Sentence-Table Cloze Pre-training", "authors": "Zihui Gu, Ju Fan, Nan Tang, Preslav Nakov, Xiaoman Zhao and Xiaoyong Du", "abstract": "Fact verification has attracted a lot of attention recently, e.g., in journalism, marketing, and policymaking, as misinformation and dis- information can sway one's opinion and affect one's actions. While fact-checking is a hard task in general, in many cases, false statements can be easily debunked based on analytics over tables with reliable information. Hence, table- based fact verification has recently emerged as an important and growing research area. Yet, progress has been limited due to the lack of datasets that can be used to pre-train language models (LMs) to be aware of common table operations, such as aggregating a column or comparing tuples. To bridge this gap, this paper introduces PASTA for table-based fact verification via pre-training with synthesized sentence\u2013table cloze questions. In particular, we design six types of common sentence\u2013table cloze tasks, including Filter, Aggregation, Superlative, Comparative, Ordinal, and Unique, based on which we synthesize a large corpus consisting of 1.2 million sentence\u2013table pairs from WikiTables. PASTA uses a recent pre-trained LM, DeBERTaV3, and further pre- trains it on our corpus. Our experimental results show that PASTA achieves new state-of-the-art (SOTA) performance on two table-based fact verification datasets TabFact and SEM-TAB- FACTS. In particular, on the complex set of TabFact, which contains multiple operations, PASTA largely outperforms previous SOTA by 4.7% (85.6% vs. 80.9%), and the gap between PASTA and human performance on the small test set is narrowed to just 1.5% (90.6% vs. 92.1%).", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [1.077452301979065, 10.55976676940918], "id": 1216, "title": "Sentiment-Aware Word and Sentence Level Pre-training for Sentiment Analysis", "authors": "Shuai Fan, Chen Lin, Haonan Li, Zhenghao Lin, Jinsong Su, Hang Zhang, Yeyun Gong, JIan Guo and Nan Duan", "abstract": "Most existing pre-trained language representation models (PLMs) are sub-optimal in sentiment analysis tasks, as they capture the sentiment information from word-level while under-considering sentence-level information. \nIn this paper, we propose SentiWSP, a novel Sentiment-aware pre-trained language model with combined Word-level and Sentence-level Pre-training tasks.\nThe word level pre-training task detects replaced sentiment words, via a generator-discriminator framework, to enhance the PLM's knowledge about sentiment words.\nThe sentence level pre-training task further strengthens the discriminator via a contrastive learning framework, with similar sentences as negative samples, to encode sentiments in a sentence.\nExtensive experimental results show that SentiWSP achieves new state-of-the-art performance on various sentence-level and aspect-level sentiment classification benchmarks. \nWe have made our code and model publicly available at https://github.com/XMUDM/SentiWSP.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [6.23079776763916, 12.178827285766602], "id": 1222, "title": "Towards Multi-Modal Sarcasm Detection via Hierarchical Congruity Modeling with Knowledge Enhancement", "authors": "Hui Liu, Wenya Wang and Haoliang Li", "abstract": "Sarcasm is a linguistic phenomenon indicating a discrepancy between literal meanings and implied intentions. Due to its sophisticated nature, it is usually difficult to be detected from the text itself. As a result, multi-modal sarcasm detection has received more and more attention in both academia and industries. However, most existing techniques only modeled the atomic-level inconsistencies between the text input and its accompanying image, ignoring more complex compositions for both modalities. Moreover, they neglected the rich information contained in external knowledge, e.g., image captions. In this paper, we propose a novel hierarchical framework for sarcasm detection by exploring both the atomic-level congruity based on multi-head cross attentions and the composition-level congruity based on graph neural networks, where a post with low congruity can be identified as sarcasm. In addition, we exploit the effect of various knowledge resources for sarcasm detection. Evaluation results on a public multi-modal sarcasm detection dataset based on Twitter demonstrate the superiority of our proposed model.", "track": "NLP Applications", "label": 0}, {"loc": [7.653224945068359, 8.598806381225586], "id": 1234, "title": "Efficiently Tuned Parameters Are Task Embeddings", "authors": "Wangchunshu Zhou, Canwen Xu and Julian McAuley", "abstract": "Intermediate-task transfer can benefit a wide range of NLP tasks with properly selected source datasets. However, it is computationally infeasible to experiment with all intermediate transfer combinations, making choosing a useful source task a challenging problem. In this paper, we anticipate that task-specific parameters updated in parameter-efficient tuning methods are likely to encode task-specific information. Therefore, such parameters can be predictive for inter-task transferability. Thus, we propose to exploit these efficiently tuned parameters as off-the-shelf task embeddings for the efficient selection of source datasets for intermediate-task transfer. We experiment with 11 text classification tasks and 11 question answering tasks. Experimental results show that our approach consistently outperforms existing inter-task transferability prediction methods while being conceptually simple and computationally efficient. Our analysis also reveals that the ability of efficiently tuned parameters on transferability prediction is disentangled with their in-task performance. This allows us to use parameters from early checkpoints as task embeddings to further improve efficiency.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.997520923614502, 3.6511054039001465], "id": 1242, "title": "COPEN: Probing Conceptual Knowledge in Pre-trained Language Models", "authors": "Hao Peng, Xiaozhi Wang, Shengding Hu, Hailong Jin, Lei Hou, Juanzi Li, Zhiyuan Liu and Qun Liu", "abstract": "Conceptual knowledge is fundamental to human cognition and knowledge bases. However, existing knowledge probing works only focus on evaluating factual knowledge of pre-trained language models (PLMs) and ignore conceptual knowledge. Since conceptual knowledge often appears as implicit commonsense behind texts, designing probes for conceptual knowledge is hard. Inspired by knowledge representation schemata, we comprehensively evaluate conceptual knowledge of PLMs by designing three tasks to probe whether PLMs organize entities by conceptual similarities, learn conceptual properties, and conceptualize entities in contexts, respectively. For the tasks, we collect and annotate 24k data instances covering 393 concepts, which is COPEN, a COnceptual knowledge Probing bENchmark. Extensive experiments on different sizes and types of PLMs show that existing PLMs systematically lack conceptual knowledge and suffer from various spurious correlations. We believe this is a critical bottleneck for realizing human-like cognition in PLMs. COPEN and our codes are publicly released at https://github.com/THU-KEG/COPEN.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [1.7433269023895264, 5.299554824829102], "id": 1245, "title": "Capturing Global Structural Information in Long Document Question Answering with Compressive Graph Selector Network", "authors": "Yuxiang Nie, Heyan Huang, Wei Wei and Xian-Ling Mao", "abstract": "Long document question answering is a challenging task due to its demands for complex reasoning over long text. Previous works usually take long documents as non-structured flat texts or only consider the local structure in long documents. However, these methods usually ignore the global structure of the long document, which is essential for long-range understanding. To tackle this problem, we propose Compressive Graph Selector Network (CGSN) to capture the global structure in a compressive and iterative manner. The proposed model mainly focuses on the evidence selection phase of long document question answering. Specifically, it consists of three modules: local graph network, global graph network and evidence memory network. Firstly, the local graph network builds the graph structure of the chunked segment in token, sentence, paragraph and segment levels to capture the short-term dependency of the text. Secondly, the global graph network selectively receives the information of each level from the local graph, compresses them into the global graph nodes and applies graph attention to the global graph nodes to build the long-range reasoning over the entire text in an iterative way. Thirdly, the evidence memory network is designed to alleviate the redundancy problem in the evidence selection by saving the selected result in the previous steps. Extensive experiments show that the proposed model outperforms previous methods on two datasets.", "track": "Question Answering", "label": 11}, {"loc": [7.311715126037598, 6.992787837982178], "id": 1247, "title": "Structural generalization is hard for sequence-to-sequence models", "authors": "Yuekun Yao and Alexander Koller", "abstract": "Sequence-to-sequence (seq2seq) models have been successful across many NLP tasks,\nincluding ones that require predicting linguistic structure. However, recent work on compositional generalization has shown that seq2seq models achieve very low accuracy in generalizing to linguistic structures that were not seen in training. We present new evidence that this is a general limitation of seq2seq models that is present not just in semantic parsing, but also in syntactic parsing and in text-to-text tasks, and that this limitation can often be overcome by neurosymbolic models that have linguistic knowledge built in. We further report on some experiments that give initial answers on the reasons for these limitations.", "track": "Theme Track", "label": 18}, {"loc": [5.789286136627197, 8.85915470123291], "id": 1250, "title": "Contrastive Learning enhanced Author-Style Headline Generation", "authors": "Hui Liu, Weidong Guo, Yige Chen and Xiangyang Li", "abstract": "Headline generation is a task of generating an appropriate headline for a given article, which can be further used for machine-aided writing or enhancing the click-through ratio. Current works only use the article itself in the generation, but have not taken the writing style of headlines into consideration. In this paper, we propose a novel Seq2Seq model called CLH3G (Contrastive Learning enhanced Historical Headlines based Headline Generation) which can use the historical headlines of the articles that the author wrote in the past to improve the headline generation of current articles. By taking historical headlines into account, we can integrate the stylistic features of the author into our model, and generate a headline not only appropriate for the article, but also consistent with the author's style. In order to efficiently learn the stylistic features of the author, we further introduce a contrastive learning based auxiliary task for the encoder of our model. Besides, we propose two methods to use the learned stylistic features to guide both the pointer and the decoder during the generation. Experimental results show that historical headlines of the same user can improve the headline generation significantly, and both the contrastive learning module and the two style features fusion methods can further boost the performance.", "track": "Natural Language Generation", "label": 6}, {"loc": [10.580215454101562, 7.632606029510498], "id": 1251, "title": "Multi-Granularity Optimization for Non-Autoregressive Translation", "authors": "Yafu Li, Leyang Cui, Yongjing Yin and Yue Zhang", "abstract": "Despite low latency, non-autoregressive machine translation (NAT) suffers severe performance deterioration due to the naive independence assumption. This assumption is further strengthened by cross-entropy loss, which encourages a strict match between the hypothesis and the reference token by token. To alleviate this issue, we propose multi-granularity optimization for NAT, which collects model behaviours on translation segments of various granularities and integrates feedback for backpropagation. Experiments on four WMT benchmarks show that the proposed method significantly outperforms the baseline models trained with cross-entropy loss, and achieves the best performance on WMT'16 En\u21d4Ro and highly competitive results on WMT'14 En\u21d4De for fully non-autoregressive translation.", "track": "Machine Translation", "label": 10}, {"loc": [7.714637279510498, 8.228096008300781], "id": 1252, "title": "Super-NaturalInstructions: Generalization via Declarative Instructions on 1600+ NLP Tasks", "authors": "Yizhong Wang, Swaroop Mishra, Pegah Alipoormolabashi, Yeganeh Kordi, Amirreza Mirzaei, Atharva Naik, Arjun Ashok, Arut Selvan Dhanasekaran, Anjana Arunkumar, David Stap, Eshaan Pathak, Giannis Karamanolakis, Haizhi Gary Lai, Ishan Virendrabhai Purohit, Ishani Mondal, Jacob William Anderson, Kirby C. Kuznia, Krima Doshi, Kuntal Kumar Pal, Maitreya Patel, Mehrad Moradshahi, Mihir Parmar, Mirali Purohit, Neeraj Varshney, Phani Rohitha Kaza, Pulkit Verma, Ravsehaj Singh Puri, rushang karia, Savan Doshi, Shailaja Keyur Sampat, Siddhartha Mishra, Sujan Reddy A, Sumanta Patro, Tanay Dixit, Xudong Shen, Chitta Baral, Yejin Choi, Noah A. Smith, Hannaneh Hajishirzi and Daniel Khashabi", "abstract": "How well can NLP models generalize to a variety of unseen tasks when provided with task instructions? To address this question, we first introduce Super-NaturalInstructions, a benchmark of 1,616 diverse NLP tasks and their expert-written instructions. Our collection covers 76 distinct task types, including but not limited to classification, extraction, infilling, sequence tagging, text rewriting, and text composition. This large and diverse collection of tasks enables rigorous benchmarking of cross-task generalization under instructions---training models to follow instructions on a subset of tasks and evaluating them on the remaining unseen ones.\nFurthermore, we build Tk-Instruct, a transformer model trained to follow a variety of in-context instructions (plain language task definitions or k-shot examples). Our experiments show that Tk-Instruct outperforms existing instruction-following models such as InstructGPT by over 9% on our benchmark despite being an order of magnitude smaller. We further analyze generalization as a function of various scaling parameters, such as the number of observed tasks, the number of instances per task, and model sizes. We hope our dataset and model facilitate future progress towards more general-purpose NLP models.", "track": "Resources and Evaluation", "label": 1}, {"loc": [0.4904191195964813, 7.1760172843933105], "id": 1256, "title": "MetaFill: Text Infilling for Meta-Path Generation on Heterogeneous Information Networks", "authors": "Zequn Liu, Kefei Duan, Junwei Yang, Hanwen Xu, Ming Zhang and Sheng Wang", "abstract": "Heterogeneous information network (HIN) is essential to study complicated networks containing multiple edge types and node types. Meta-path, a sequence of node types and edge types, is the core technique to embed HINs. Since manually curating meta-paths is time-consuming, there is a pressing need to develop automated meta-path generation approaches. Existing meta-path generation approaches cannot fully exploit the rich textual information in HINs, such as node names and edge type names. To address this problem, we propose MetaFill, a text-infilling-based approach for meta-path generation. The key idea of MetaFill is to formulate meta-path identification problem as a word sequence infilling problem, which can be advanced by pretrained language models (PLMs). We observed the superior performance of MetaFill against existing meta-path generation methods and graph embedding methods that do not leverage meta-paths in both link prediction and node classification on two real-world HIN datasets. We further demonstrated how MetaFill can accurately classify edges in the zero-shot setting, where existing approaches cannot generate any meta-paths. MetaFill exploits PLMs to generate meta-paths for graph embedding, opening up new avenues for language model applications in graph analysis.", "track": "NLP Applications", "label": 0}, {"loc": [1.740719199180603, 5.443744659423828], "id": 1258, "title": "DRLK: Dynamic Hierarchical Reasoning with Language Model and Knowledge Graph for Question Answering", "authors": "Miao Zhang, Rufeng Dai, Ming Dong and Tingting He", "abstract": "In recent years, Graph Neural Network (GNN) approaches with enhanced knowledge graphs (KG) perform well in question answering (QA) tasks. One critical challenge is how to effectively utilize interactions between the QA context and KG. However, existing work only adopts the identical QA context representation to interact with multiple layers of KG, which results in a restricted interaction. In this paper, we propose DRLK (Dynamic Hierarchical Reasoning with Language Model and Knowledge Graphs), a novel model that utilizes dynamic hierarchical interactions between the QA context and KG for reasoning. DRLK extracts dynamic hierarchical features in the QA context, and performs inter-layer and intra-layer interactions on each iteration, allowing the KG representation to be grounded with the hierarchical features of the QA context. We conduct extensive experiments on four benchmark datasets in medical QA and commonsense reasoning. The experimental results demonstrate that DRLK achieves state-of-the-art performances on two benchmark datasets and performs competitively on the others.", "track": "Question Answering", "label": 11}, {"loc": [3.522233486175537, 7.451706409454346], "id": 1270, "title": "AEG: Argumentative Essay Generation via A Dual-Decoder Model with Content Planning", "authors": "Jianzhu Bao, Yasheng Wang, Yitong Li, Fei Mi and Ruifeng Xu", "abstract": "Argument generation is an important but challenging task in computational argumentation.\nExisting studies have mainly focused on generating individual short arguments, while research on generating long and coherent argumentative essays is still under-explored.\nIn this paper, we propose a new task, Argumentative Essay Generation (AEG).\nGiven a writing prompt, the goal of AEG is to automatically generate an argumentative essay with strong persuasiveness.\nWe construct a large-scale dataset, ArgEssay, for this new task and establish a strong model based on a dual-decoder Transformer architecture.\nOur proposed model contains two decoders, a planning decoder (PD) and a writing decoder (WD), where PD is used to generate a sequence for essay content planning and WD incorporates the planning information to write an essay.\nFurther, we pre-train this model on a large news dataset to enhance the plan-and-write paradigm.\nAutomatic and human evaluation results show that our model can generate more coherent and persuasive essays with higher diversity and less repetition compared to several baselines.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [4.268440246582031, 7.363239765167236], "id": 1271, "title": "BotsTalk: Machine-sourced Framework for Automatic Curation of Large-scale Multi-skill Dialogue Datasets", "authors": "Minju Kim, Chae Hyeong Kim, Yong Ho Song, Seung-won Hwang and Jinyoung Yeo", "abstract": "To build open-domain chatbots that are able to use diverse communicative skills, we propose a novel framework BotsTalk, where multiple agents grounded to the specific target skills participate in a conversation to automatically annotate multi-skill dialogues. We further present Blended Skill BotsTalk (BSBT), a large-scale multi-skill dialogue dataset comprising 300K conversations. Through extensive experiments, we demonstrate that our dataset can be effective for multi-skill dialogue systems which require an understanding of skill blending as well as skill grounding. Our code and data are available at https://github.com/convei-lab/BotsTalk.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [1.757055401802063, 9.10567855834961], "id": 1285, "title": "Wider & Closer: Mixture of Short-channel Distillers for Zero-shot Cross-lingual Named Entity Recognition", "authors": "Jun-Yu Ma, Beiduo Chen, Jia-Chen Gu, Zhenhua Ling, Wu Guo, Quan Liu, Zhigang Chen and Cong Liu", "abstract": "Zero-shot cross-lingual named entity recognition (NER) aims at transferring knowledge from annotated and rich-resource data in source languages to unlabeled and lean-resource data in target languages. Existing mainstream methods based on the teacher-student distillation framework ignore the rich and complementary information lying in the intermediate layers of pre-trained language models, and domain-invariant information is easily lost during transfer. In this study, a mixture of short-channel distillers (MSD) method is proposed to fully interact the rich hierarchical information in the teacher model and to transfer knowledge to the student model sufficiently and efficiently. Concretely, a multi-channel distillation framework is designed for sufficient information transfer by aggregating multiple distillers as a mixture. Besides, an unsupervised method adopting parallel domain adaptation is proposed to shorten the channels between the teacher and student models to preserve domain-invariant features. Experiments on four datasets across nine languages demonstrate that the proposed method achieves new state-of-the-art performance on zero-shot cross-lingual NER and shows great generalization and compatibility across languages and fields.", "track": "Information Extraction", "label": 5}, {"loc": [1.9687765836715698, 5.129182815551758], "id": 1288, "title": "An Efficient Memory-Augmented Transformer for Knowledge-Intensive NLP Tasks", "authors": "Yuxiang Wu, Yu Zhao, Baotian Hu, Pasquale Minervini, Pontus Stenetorp and Sebastian Riedel", "abstract": "Access to external knowledge is essential for many natural language processing tasks, such as question answering and dialogue. Existing methods often rely on a parametric model that stores knowledge in its parameters, or use a retrieval-augmented model that has access to an external knowledge source. Parametric and retrieval-augmented models have complementary strengths in terms of computational efficiency and predictive accuracy. To combine the strength of both approaches, we propose the Efficient Memory-Augmented Transformer (EMAT) \u2013 it encodes external knowledge into a key-value memory and exploits the fast maximum inner product search for memory querying. We also introduce pre-training tasks that allow EMAT to encode informative key-value representations, and to learn an implicit strategy to integrate multiple memory slots into the transformer. Experiments on various knowledge-intensive tasks such as question answering and dialogue datasets show that, simply augmenting parametric models (T5-base) using our method produces more accurate results (e.g., 25.8 \u2192 44.3 EM on NQ) while retaining a high throughput (e.g., 1000 queries/s on NQ). Compared to retrieval-augmented models, EMAT runs substantially faster across the board and produces more accurate results on WoW and ELI5.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [6.504973888397217, 11.903962135314941], "id": 1299, "title": "Supervised Prototypical Contrastive Learning for Emotion Recognition in Conversation", "authors": "Xiaohui Song, Longtao Huang, Hui Xue and Songlin Hu", "abstract": "Capturing emotions within a conversation plays an essential role in modern dialogue systems. However, the weak correlation between emotions and semantics brings many challenges to emotion recognition in conversation (ERC). Even semantically similar utterances, the emotion may vary drastically depending on contexts or speakers. In this paper, we propose a Supervised Prototypical Contrastive Learning (SPCL) loss for the ERC task. Leveraging the Prototypical Network, the SPCL targets at solving the imbalanced classification problem through contrastive learning and does not require a large batch size. Meanwhile, we design a difficulty measure function based on the distance between classes and introduce curriculum learning to alleviate the impact of extreme samples. We achieve state-of-the-art results on three widely used benchmarks. Further, we conduct analytical experiments to demonstrate the effectiveness of our proposed SPCL and curriculum learning strategy.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [6.958658695220947, 6.0187602043151855], "id": 1314, "title": "RuCoLA: Russian Corpus of Linguistic Acceptability", "authors": "Vladislav Mikhailov, Tatiana Shamardina, Max Ryabinin, Alena Pestova, Ivan Smurov and Ekaterina Artemova", "abstract": "Linguistic acceptability (LA) attracts the attention of the research community due to its many uses, such as testing the grammatical knowledge of language models and filtering implausible texts with acceptability classifiers.\nHowever, the application scope of LA in languages other than English is limited due to the lack of high-quality resources.\nTo this end, we introduce the Russian Corpus of Linguistic Acceptability (RuCoLA), built from the ground up under the well-established binary LA approach. \nRuCoLA consists of 9.8k in-domain sentences from linguistic publications and 3.6k out-of-domain sentences produced by generative models. The out-of-domain set is created to facilitate the practical use of acceptability for improving language generation.\nOur paper describes the data collection protocol and presents a fine-grained analysis of acceptability classification experiments with a range of baseline approaches.\nIn particular, we demonstrate that the most widely used language models still fall behind humans by a large margin, especially when detecting morphological and semantic errors. We release RuCoLA, the code of experiments, and a public leaderboard to assess the linguistic competence of language models for Russian.", "track": "Resources and Evaluation", "label": 1}, {"loc": [0.47761020064353943, 7.118537425994873], "id": 1315, "title": "Complex Hyperbolic Knowledge Graph Embeddings with Fast Fourier Transform", "authors": "Huiru Xiao, Xin Liu, Yangqiu Song, Ginny Y. Wong and Simon See", "abstract": "The choice of geometric space for knowledge graph (KG) embeddings can have significant effects on the performance of KG completion tasks. The hyperbolic geometry has been shown to capture the hierarchical patterns due to its tree-like metrics, which addressed the limitations of the Euclidean embedding models. Recent explorations of the complex hyperbolic geometry further improved the hyperbolic embeddings for capturing a variety of hierarchical structures. However, the performance of the hyperbolic KG embedding models for non-transitive relations is still unpromising, while the complex hyperbolic embeddings do not deal with multi-relations. This paper aims to utilize the representation capacity of the complex hyperbolic geometry in multi-relational KG embeddings. To apply the geometric transformations which account for different relations and the attention mechanism in the complex hyperbolic space, we propose to use the fast Fourier transform (FFT) as the conversion between the real and complex hyperbolic space. Constructing the attention-based transformations in the complex space is very challenging, while the proposed Fourier transform-based complex hyperbolic approaches provide a simple and effective solution. Experimental results show that our methods outperform the baselines, including the Euclidean and the real hyperbolic embedding models.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [1.597752571105957, 5.320528507232666], "id": 1331, "title": "Towards Knowledge-Intensive Text-to-SQL Semantic Parsing with Formulaic Knowledge", "authors": "Longxu Dou, Yan Gao, Xuqi Liu, Mingyang Pan, Dingzirui Wang, Wanxiang Che, Dechen Zhan, Min-Yen Kan and Jian-Guang LOU", "abstract": "In this paper, we study the problem of knowledge-intensive text-to-SQL, in which domain knowledge is necessary to parse expert questions into SQL queries over domain-specific tables. We formalize this scenario by building a new benchmark KnowSQL consisting of domain-specific questions covering various domains. We then address this problem by representing formulaic knowledge rather than by annotating additional data examples. More concretely, we construct a formulaic knowledge bank as a domain knowledge base and propose a framework (ReGrouP) to leverage this formulaic knowledge during parsing. Experiments using ReGrouP demonstrate a significant 28.2% improvement overall on KnowSQL.", "track": "Resources and Evaluation", "label": 1}, {"loc": [6.13363790512085, 5.944369792938232], "id": 1332, "title": "Should We Ban English NLP for a Year?", "authors": "Anders S\u00f8gaard", "abstract": "Around two thirds of NLP research at top venues is devoted exclusively to developing technology for speakers of English, most speech data comes from young urban speakers, and most texts used to train language models come from male writers. These biases feed into consumer technologies to widen existing inequality gaps, \nnot only within, but also across, societies. Many have argued that it is almost impossible to mitigate inequality amplification. I argue that, on the contrary, it is quite simple to do so, and that counter-measures would have little-to-no negative impact, except for, perhaps, in the very short term.", "track": "Ethics", "label": 21}, {"loc": [2.7405519485473633, 4.682299613952637], "id": 1341, "title": "LittleBird: Efficient Faster & Longer Transformer for Question Answering", "authors": "Minchul Lee, Kijong Han and Myeong Cheol Shin", "abstract": "BERT has shown a lot of sucess in a wide variety of NLP tasks. But it has a limitation dealing with long inputs due to its attention mechanism. Longformer, ETC and BigBird addressed this issue and effectively solved the quadratic dependency problem.\nHowever we find that these models are not sufficient, and propose LittleBird, a novel model based on BigBird with improved speed and memory footprint while maintaining accuracy.\nIn particular, we devise a more flexible and efficient position representation method based on Attention with Linear Biases(ALiBi). We also show that replacing the method of global information represented in the BigBird with pack and unpack attention is more effective.\nThe proposed model can work on long inputs even after being pre-trained on short inputs, and can be trained efficiently reusing existing pre-trained language model for short inputs. This is a significant benefit for low-resource languages where large amounts of long text data are difficult to obtain.\nAs a result, our experiments show that LittleBird works very well in a variety of languages, achieving high performance in question answering tasks, particularly in KorQuAD2.0, Korean Question Answering Dataset for long paragraphs.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [10.774970054626465, 7.004262447357178], "id": 1353, "title": "WeTS: A Benchmark for Translation Suggestion", "authors": "Zhen Yang, Fandong Meng, Yingxue Zhang, Ernan Li and Jie Zhou", "abstract": "Translation suggestion (TS), which provides alternatives for specific words or phrases given the entire documents generated by machine translation (MT), has been proven to play a significant role in post-editing (PE). There are two main pitfalls for existing researches in this line. First, most conventional works only focus on the overall performance of PE but ignore the exact performance of TS, which makes the progress of PE sluggish and less explainable; Second, as no publicly available golden dataset exists to support in-depth research for TS, almost all of the previous works conduct experiments on their in-house datasets or the noisy datasets built automatically, which makes their experiments hard to be reproduced and compared. To break these limitations mentioned above and spur the research in TS, we create a benchmark dataset, called \\emph{WeTS}, which is a golden corpus annotated by expert translators on four translation directions. Apart from the golden corpus, we also propose several methods to generate synthetic corpora which can be used to improve the performance substantially through pre-training. As for the model, we propose the segment-aware self-attention based Transformer for TS. Experimental results show that our approach achieves the best results on all four directions, including English-to-German, German-to-English, Chinese-to-English, and English-to-Chinese.\\footnote{For reviewers, codes and corpus can be found in the attached files, and we will make them publicly available after the double-blind phase.}", "track": "Machine Translation", "label": 10}, {"loc": [10.105525970458984, 7.711192607879639], "id": 1361, "title": "Discrete Cross-Modal Alignment Enables Zero-Shot Speech Translation", "authors": "Chen Wang, Yuchen Liu, Boxing Chen, Jiajun Zhang, Wei Luo, Zhongqiang Huang and Chengqing Zong", "abstract": "End-to-end Speech Translation (ST) aims at translating the source language speech into target language text without generating the intermediate transcriptions. However, the training of end-to-end methods relies on parallel ST data, which are difficult and expensive to obtain. Fortunately, the supervised data for automatic speech recognition (ASR) and machine translation (MT) are usually more accessible, making zero-shot speech translation a potential direction. Existing zero-shot methods fail to align the two modalities of speech and text into a shared semantic space, resulting in much worse performance compared to the supervised ST methods. In order to enable zero-shot ST, we propose a novel Discrete Cross-Modal Alignment (DCMA) method that employs a shared discrete vocabulary space to accommodate and match both modalities of speech and text. Specifically, we introduce a vector quantization module to discretize the continuous representations of speech and text into a finite set of virtual tokens, and use ASR data to map corresponding speech and text to the same virtual token in a shared codebook. This way, source language speech can be embedded in the same semantic space as the source language text, which can be then transformed into target language text with an MT module. Experiments on multiple language pairs demonstrate that our zero-shot ST method significantly improves the SOTA, and even performers on par with the strong supervised ST baselines.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [3.801358699798584, 10.016355514526367], "id": 1363, "title": "Abstractive Summarization Guided by Latent Hierarchical Document Structure", "authors": "Yifu Qiu and Shay B. Cohen", "abstract": "Sequential abstractive neural summarizers often do not use the underlying structure in the input article or dependencies between the input sentences. This structure is essential to integrate and consolidate information from different parts of the text. To address this shortcoming, we propose a hierarchy-aware graph neural network (HierGNN) which captures such dependencies through three main steps: 1) learning a hierarchical document structure through a latent structure tree learned by a sparse matrix-tree computation; 2) propagating sentence information over this structure using a novel message-passing node propagation mechanism to identify salient information; 3) using graph-level attention to concentrate the decoder on salient information. Experiments confirm HierGNN improves strong sequence models such as BART, with a 0.55 and 0.75 margin in average ROUGE-1/2/L for CNN/DM and XSum. Further human evaluation demonstrates that summaries produced by our model are more relevant and less redundant than the baselines, into which HierGNN is incorporated. We also find HierGNN synthesizes summaries by fusing multiple source sentences more, rather than compressing a single source sentence, and that it processes long inputs more effectively.", "track": "Summarization", "label": 14}, {"loc": [4.096625804901123, 4.348830223083496], "id": 1364, "title": "Explainable Question Answering based on Semantic Graph by Global Differentiable Learning and Dynamic Adaptive Reasoning", "authors": "Jianguo Mao, Wenbin Jiang, Xiangdong Wang, Hong Liu, Yu Xia, Yajuan Lyu and QiaoQiao She", "abstract": "Multi-hop Question Answering is an agent task for testing the reasoning ability. With the development of pre-trained models, the implicit reasoning ability has been surprisingly improved and can even surpass human performance. However, the nature of the black box hinders the construction of explainable intelligent systems. Several researchers have explored explainable neural-symbolic reasoning methods based on question decomposition techniques. The undifferentiable symbolic operations and the error propagation in the reasoning process lead to poor performance. To alleviate it, we propose a simple yet effective Global Differentiable Learning strategy to explore optimal reasoning paths from the latent probability space so that the model learns to solve intermediate reasoning processes without expert annotations. We further design a Dynamic Adaptive Reasoner to enhance the generalization of unseen questions. Our method achieves 17% improvements in F1-score against BreakRC and shows better interpretability. We take a step forward in building interpretable reasoning methods.", "track": "Question Answering", "label": 11}, {"loc": [2.132932186126709, 4.059805393218994], "id": 1368, "title": "DuReader-Retrieval: A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine", "authors": "Yifu Qiu, Hongyu Li, Yingqi Qu, Ying Chen, QiaoQiao She, Jing Liu, Hua Wu and Haifeng Wang", "abstract": "In this paper, we present DuReader-retrieval, a large-scale Chinese dataset for passage retrieval. DuReader-retrieval contains more than 90K queries and over 8M unique passages from a commercial search engine. To alleviate the shortcomings of other datasets and ensure the quality of our benchmark, we (1) reduce the false negatives in development and test sets by manually annotating results pooled from multiple retrievers, and (2) remove the training queries that are semantically similar to the development and testing queries. Additionally, we provide two out-of-domain testing sets for cross-domain evaluation, as well as a set of human translated queries for for cross-lingual retrieval evaluation. The experiments demonstrate that DuReader-retrieval is challenging and a number of problems remain unsolved, such as the salient phrase mismatch and the syntactic mismatch between queries and paragraphs. These experiments also show that dense retrievers do not generalize well across domains, and cross-lingual retrieval is essentially challenging. DuReader-retrieval is publicly available at https://github.com/baidu/DuReader/tree/master/DuReader-Retrieval.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [0.6319910287857056, 7.908598899841309], "id": 1383, "title": "Pair-Based Joint Encoding with Relational Graph Convolutional Networks for Emotion-Cause Pair Extraction", "authors": "Junlong Liu, Xichen Shang and Qianli Ma", "abstract": "Emotion-cause pair extraction (ECPE) aims to extract emotion clauses and corresponding cause clauses, which have recently received growing attention. Previous methods sequentially encode features with a specified order. They first encode the emotion and cause features for clause extraction and then combine them for pair extraction. This lead to an imbalance in inter-task feature interaction where features extracted later have no direct contact with the former. To address this issue, we propose a novel **P**air-**B**ased **J**oint **E**ncoding (**PBJE**) network, which generates pairs and clauses features simultaneously in a joint feature encoding manner to model the causal relationship in clauses. PBJE can balance the information flow among emotion clauses, cause clauses and pairs. From a multi-relational perspective, we construct a heterogeneous undirected graph and apply the Relational Graph Convolutional Network (RGCN) to capture the multiplex relationship between clauses and the relationship between pairs and clauses. Experimental results show that PBJE achieves state-of-the-art performance on the Chinese benchmark corpus.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [1.0410996675491333, 10.52194595336914], "id": 1385, "title": "Affective Knowledge Enhanced Multiple-Graph Fusion Networks for Aspect-based Sentiment Analysis", "authors": "Siyu Tang, Heyan Chai, Ziyi Yao, Ye Ding, Cuiyun Gao, Binxing Fang and Qing Liao", "abstract": "Aspect-based sentiment analysis aims to identify sentiment polarity of social media users toward different aspects. Most recent methods adopt the aspect-centric latent tree to connect aspects and their corresponding opinion words, thinking that would facilitate establishing the relationship between aspects and opinion words.\nHowever, these methods ignore the roles of syntax dependency relation labels and affective semantic information in determining the sentiment polarity, resulting in the wrong prediction.\nIn this paper, we propose a novel multi-graph fusion network (MGFN) based on latent graph to leverage the richer syntax dependency relation label information and affective semantic information of words.\nSpecifically, we construct a novel syntax-aware latent graph (SaLG) to fully leverage the syntax dependency relation label information to facilitate the learning of sentiment representations. Subsequently, a multi-graph fusion module is proposed to fuse semantic information of surrounding contexts of aspects adaptively. Furthermore, we design an affective refinement strategy to guide the MGFN to capture significant affective clues. \nExtensive experiments on three datasets demonstrate that our MGFN model outperforms all state-of-the-art methods and verify the effectiveness of our model.", "track": "NLP Applications", "label": 0}, {"loc": [5.338454246520996, 8.778501510620117], "id": 1391, "title": "IndicNLG Benchmark: Multilingual Datasets for Diverse NLG Tasks in Indic Languages", "authors": "Aman Mr. Kumar, Himani Shrotriya, Prachi Sahu, Amogh Mishra, Raj Dabre, Ratish Puduppully, Anoop Kunchukuttan, Mitesh M. Khapra and Pratyush Kumar", "abstract": "Natural Language Generation (NLG) for non-English languages is hampered by the scarcity of datasets in these languages. We present the IndicNLG Benchmark, a collection of datasets for benchmarking NLG for 11 Indic languages. We focus on five diverse tasks, namely, biography generation using Wikipedia infoboxes, news headline generation, sentence summarization, paraphrase generation and, question generation. We describe the created datasets and use them to benchmark the performance of several monolingual and multilingual baselines that leverage pre-trained sequence-to-sequence models. Our results exhibit the strong performance of multilingual language-specific pre-trained models, and the utility of models trained on our dataset for other related NLG tasks. Our dataset creation methods can be easily applied to modest-resource languages as they involve simple steps such as scraping news articles and Wikipedia infoboxes, light cleaning, and pivoting through machine translation data. To the best of our knowledge, the IndicNLG Benchmark is the first NLG benchmark for Indic languages and the most diverse multilingual NLG dataset, with approximately 8M examples across 5 tasks and 11 languages. The datasets and models will be publicly available.", "track": "Natural Language Generation", "label": 6}, {"loc": [10.711258888244629, 6.9316020011901855], "id": 1400, "title": "Improving Machine Translation with Phrase Pair Injection and Corpus Filtering", "authors": "Akshay Batheja and Pushpak Bhattacharyya", "abstract": "In this paper, we show that the combination of Phrase Pair Injection and Corpus Filtering boosts the performance of Neural Machine Translation (NMT) systems. We extract parallel phrases and sentences from the pseudo-parallel corpus and augment it with the parallel corpus to train the NMT models. With the proposed approach, we observe an improvement in the Machine Translation (MT) system for 3 low-resource language pairs, Hindi-Marathi, English-Marathi, and English-Pashto, and 6 translation directions by up to 2.7 BLEU points, on the FLORES test data. These BLEU score improvements are over the models trained using the whole pseudo-parallel corpus augmented with the parallel corpus.", "track": "Machine Translation", "label": 10}, {"loc": [5.844703674316406, 12.000345230102539], "id": 1405, "title": "An Anchor-based Relative Position Embedding Method for Cross-Modal Tasks", "authors": "Ya Wang, Xingwu Sun, Lian Fengzong, ZhanHui Kang and Chengzhong Xu Xu", "abstract": "Position Embedding (PE) is essential for transformer to capture the sequence ordering of input tokens. Despite its general effectiveness verified in Natural Language Processing (NLP) and Computer Vision (CV), its application in cross-modal tasks remains unexplored and suffers from two challenges: 1) the input text tokens and image patches are not aligned, 2) the encoding space of each modality is different, making it unavailable for feature comparison. In this paper, we propose a unified position embedding method for these problems, called AnChor-basEd Relative Position Embedding (ACE-RPE), in which we first introduce an anchor locating mechanism to bridge the semantic gap and locate anchors from different modalities. Then we conduct the distance calculation of each text token and image patch by computing their shortest paths from the located anchors. Last, we embed the anchor-based distance to guide the computation of cross-attention. In this way, it calculates cross-modal relative position embedding for cross-modal transformer. Benefiting from ACE-RPE, our method obtains new SOTA results on a wide range of benchmarks, such as Image-Text Retrieval on MS-COCO and Flickr30K, Visual Entailment on SNLI-VE, Visual Reasoning on NLVR2 and Weakly-supervised Visual Grounding on RefCOCO+.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [10.820908546447754, 6.834684371948242], "id": 1413, "title": "Norm-based Noisy Corpora Filtering and Refurbishing in Neural Machine Translation", "authors": "Yu Lu and Jiajun Zhang", "abstract": "Recent advances in neural machine translation depend on massive parallel corpora, which are collected from any open source without much guarantee of quality. It stresses the need for noisy corpora filtering, but existing methods are insufficient to solve this issue. They spend much time ensembling multiple scorers trained on clean bitexts, unavailable for low-resource languages in practice. In this paper, we propose a norm-based noisy corpora filtering and refurbishing method with no external data and costly scorers. The noisy and clean samples are separated based on how much information from the source and target sides the model requires to fit the given translation. For the unparallel sentence, the target-side history translation is much more important than the source context, contrary to the parallel ones. The amount of these two information flows can be measured by norms of source-/target-side context vectors. Moreover, we propose to reuse the discovered noisy data by generating pseudo labels via online knowledge distillation. Extensive experiments show that our proposed filtering method performs comparably with state-of-the-art noisy corpora filtering techniques but is more efficient and easier to operate. Noisy sample refurbishing further enhances the performance by making the most of the given data.", "track": "Machine Translation", "label": 10}, {"loc": [5.11588716506958, 8.898054122924805], "id": 1427, "title": "TeleMelody: Lyric-to-Melody Generation with a Template-Based Two-Stage Method", "authors": "Zeqian Ju, Peiling Lu, Xu Tan, Rui Wang, Chen Zhang, Songruoyao Wu, Kejun Zhang, Xiang-Yang Li, Tao Qin and Tie-Yan Liu", "abstract": "Lyric-to-melody generation is an important task in automatic songwriting. Previous lyric-to-melody generation systems usually adopt end-to-end models that directly generate melodies from lyrics, which suffer from several issues: 1) lack of paired lyric-melody training data; 2) lack of control on generated melodies. In this paper, we develop TeleMelody, a two-stage lyric-to-melody generation system with music template (e.g., tonality, chord progression, rhythm pattern, and cadence) to bridge the gap between lyrics and melodies (i.e., the system consists of a lyric-to-template module and a template-to-melody module). TeleMelody has two advantages. First, it is data efficient. The template-to-melody module is trained in a self-supervised way (i.e., the source template is extracted from the target melody) that does not need any lyric-melody paired data. The lyric-to-template module is made up of some rules and a lyric-to-rhythm model, which is trained with paired lyric-rhythm data that is easier to obtain than paired lyric-melody data. Second, it is controllable. The design of the template ensures that the generated melodies can be controlled by adjusting the musical elements in the template. Both subjective and objective experimental evaluations demonstrate that TeleMelody generates melodies with higher quality, better controllability, and less requirement on paired lyric-melody data than previous generation systems.", "track": "NLP Applications", "label": 0}, {"loc": [2.2984049320220947, 7.347108364105225], "id": 1429, "title": "SEEN: Structured Event Enhancement Network for Explainable Need Detection of Information Recall Assistance", "authors": "You-En Lin, An-Zi Yen, Hen-Hsen Huang and Hsin-Hsi Chen", "abstract": "When recalling life experiences, people often forget or confuse life events, which necessitates information recall services. Previous work on information recall focuses on providing such assistance reactively, i.e., by retrieving the life event of a given query. Proactively detecting the need for information recall services is rarely discussed. In this paper, we use a human-annotated life experience retelling dataset to detect the right time to trigger the information recall service. We propose a pilot model\u2014structured event enhancement network (SEEN) that detects life event inconsistency, additional information in life events, and forgotten events. A fusing mechanism is also proposed to incorporate event graphs of stories and enhance the textual representations. To explain the need detection results, SEEN simultaneously provides support evidence by selecting the related nodes from the event graph. Experimental results show that SEEN achieves promising performance in detecting information needs. In addition, the extracted evidence can be served as complementary information to remind users what events they may want to recall.", "track": "NLP Applications", "label": 0}, {"loc": [5.914653301239014, 9.159542083740234], "id": 1449, "title": "Rethinking Style Transformer with Energy-based Interpretation: Adversarial Unsupervised Style Transfer using a Pretrained Model", "authors": "Hojun Cho, Dohee Kim, Seungwoo Ryu, ChaeHun Park, Hyungjong Noh, Jeong-in Hwang, Minseok Choi, Edward Choi and Jaegul Choo", "abstract": "Style control, content preservation, and fluency determine the quality of text style transfer models. To train on a nonparallel corpus, several existing approaches aim to deceive the style discriminator with an adversarial loss. However, adversarial training significantly degrades fluency compared to the other two metrics. In this work, we explain this phenomenon using energy-based interpretation, and leverage a pretrained language model to improve fluency. Specifically, we propose a novel approach which applies the pretrained language model to the text style transfer framework by restructuring the discriminator and the model itself, allowing the generator and the discriminator to also take advantage of the power of the pretrained model. We evaluated our model on three public benchmarks GYAFC, Amazon, and Yelp and achieved state-of-the-art performance on the overall metrics.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [10.497294425964355, 7.165591239929199], "id": 1454, "title": "Towards Robust k-Nearest-Neighbor Machine Translation", "authors": "Hui Jiang, Ziyao Lu, Fandong Meng, Chulun Zhou, Jie Zhou, Degen Huang and Jinsong Su", "abstract": "k-Nearest-Neighbor Machine Translation (kNN-MT) becomes an important research direction of NMT in recent years. Its main idea is to retrieve useful key-value pairs from an additional datastore to modify translations without updating the NMT model. However, the underlying retrieved noisy pairs will dramatically deteriorate the model performance. In this paper, we conduct a preliminary study and find that this problem results from not fully exploiting the prediction of the NMT model. To alleviate the impact of noise, we propose a confidence-enhanced kNN-MT model with robust training. Concretely, we introduce the NMT confidence to refine the modeling of two important components of kNN-MT: kNN distribution and the interpolation weight. Meanwhile we inject two types of perturbations into the retrieved pairs for robust training. Experimental results on four benchmark datasets demonstrate that our model not only achieves significant improvements over current kNN-MT models, but also exhibits better robustness. Our code is available at https://github.com/DeepLearnXMU/Robust-knn-mt.", "track": "Machine Translation", "label": 10}, {"loc": [2.9545516967773438, 6.068011283874512], "id": 1456, "title": "Tiny-NewsRec: Effective and Efficient PLM-based News Recommendation", "authors": "Yang Yu, Fangzhao Wu, Chuhan Wu, Jingwei Yi and Qi Liu", "abstract": "News recommendation is a widely adopted technique to provide personalized news feeds for the user. Recently, pre-trained language models (PLMs) have demonstrated the great capability of natural language understanding and benefited news recommendation via improving news modeling. However, most existing works simply finetune the PLM with the news recommendation task, which may suffer from the known domain shift problem between the pre-training corpus and downstream news texts. Moreover, PLMs usually contain a large volume of parameters and have high computational overhead, which imposes a great burden on low-latency online services. In this paper, we propose Tiny-NewsRec, which can improve both the effectiveness and the efficiency of PLM-based news recommendation. We first design a self-supervised domain-specific post-training method to better adapt the general PLM to the news domain with a contrastive matching task between news titles and news bodies. We further propose a two-stage knowledge distillation method to improve the efficiency of the large PLM-based news recommendation model while maintaining its performance. Multiple teacher models originated from different time steps of our post-training procedure are used to transfer comprehensive knowledge to the student model in both its post-training stage and finetuning stage. Extensive experiments on two real-world datasets validate the effectiveness and efficiency of our method.", "track": "NLP Applications", "label": 0}, {"loc": [7.633563995361328, 3.6134538650512695], "id": 1461, "title": "TABS: Efficient Textual Adversarial Attack for Pre-trained NL Code Model Using Semantic Beam Search", "authors": "YunSeok Choi, Hyojun Kim and Jee-Hyong Lee", "abstract": "As pre-trained models have shown successful performance in program language processing as well as natural language processing, adversarial attacks on these models also attract attention.\nHowever, previous works on black-box adversarial attacks generated adversarial examples in a very inefficient way with simple greedy search. \nThey also failed to find out better adversarial examples because it was hard to reduce the search space without performance loss.\nIn this paper, we propose TABS, an efficient beam search black-box adversarial attack method. \nWe adopt beam search to find out better adversarial examples, and contextual semantic filtering to effectively reduce the search space. \nContextual semantic filtering reduces the number of candidate adversarial words considering the surrounding context and the semantic similarity.\nOur proposed method shows good performance in terms of attack success rate, the number of queries, and semantic similarity in attacking models for two tasks: NL code search classification and retrieval tasks.", "track": "Natural Language Generation", "label": 6}, {"loc": [4.664431095123291, 4.674422264099121], "id": 1464, "title": "Investigating the Robustness of Natural Language Generation from Logical Forms via Counterfactual Samples", "authors": "Chengyuan Liu, Leilei Gan, Kun Kuang and Fei Wu", "abstract": "The aim of Logic2Text is to generate controllable and faithful texts conditioned on tables and logical forms, which not only requires a deep understanding of the tables and logical forms, but also warrants symbolic reasoning over the tables according to the logical forms. State-of-the-art methods based on pre-trained models have achieved remarkable performance on the standard test dataset. However, we question whether these methods really learn how to perform logical reasoning, rather than just relying on the spurious correlations between the headers of the tables and operators of the logical form. To verify this hypothesis, we manually construct a set of counterfactual samples, which modify the original logical forms to generate counterfactual logical forms with rare co-occurred headers and operators and corresponding counterfactual references. SOTA methods give much worse results on these counterfactual samples compared with the results on the original test dataset, which verifies our hypothesis. To deal with this problem, we firstly analyze this bias from a causal perspective, based on which we propose two approaches to reduce the model's reliance on the shortcut. The first one incorporates the hierarchical structure of the logical forms into the model. The second one exploits automatically generated counterfactual data for training. Automatic and manual experimental results on the original test dataset and counterfactual dataset show that our method is effective to alleviate the spurious correlation. Our work points out the weakness of current methods and takes a further step toward developing Logic2Text models with real logical reasoning ability.", "track": "Natural Language Generation", "label": 6}, {"loc": [10.582795143127441, 7.6640706062316895], "id": 1482, "title": "Helping the Weak Makes You Strong: Simple Multi-Task Learning Improves Non-Autoregressive Translators", "authors": "Xinyou Wang, Zaixiang Zheng and Shujian Huang", "abstract": "Recently, non-autoregressive (NAR) neural machine translation models have received increasing attention due to their efficient parallel decoding.\nHowever, the probabilistic framework of NAR models necessitates conditional independence assumption on target sequences, falling short of characterizing human language data.\nThis drawback results in less informative learning signals for NAR models under conventional MLE training, thereby yielding unsatisfactory accuracy compared to their autoregressive (AR) counterparts.\nIn this paper, we propose a simple and model-agnostic multi-task learning framework to provide more informative learning signals.\nDuring training stage, we introduce a set of sufficiently weak AR decoders that solely rely on the information provided by NAR decoder to make prediction, forcing the NAR decoder to become stronger or else it will be unable to support its weak AR partners.\nExperiments on WMT and IWSLT datasets show that our approach can consistently improve accuracy of multiple NAR baselines without adding any additional decoding overhead.", "track": "Machine Translation", "label": 10}, {"loc": [6.490644454956055, 1.888877034187317], "id": 1483, "title": "RACE: Retrieval-augmented Commit Message Generation", "authors": "Ensheng Shi, Yanlin Wang, Wei Tao, Lun Du, hongyu Zhang, Shi Han, Dongmei Zhang and Hongbin Sun", "abstract": "Commit messages are important for software development and maintenance. Many neural network-based approaches have been proposed and shown promising results on automatic commit message generation. However, the generated commit messages could be repetitive or redundant. In this paper, we propose RACE, a new retrieval-augmented neural commit message generation method, which treats the retrieved similar commit as an exemplar and leverages it to generate an accurate commit message. As the retrieved commit message may not always accurately describe the content/intent of the current code diff, we also propose an exemplar guider, which learns the semantic similarity between the retrieved and current code diff and then guides the generation of commit message based on the similarity. We conduct extensive experiments on a large public dataset with five programming languages. Experimental results show that RACE can outperform all baselines. Furthermore, RACE can boost the performance of existing Seq2Seq models in commit message generation.", "track": "Summarization", "label": 14}, {"loc": [4.323120594024658, 4.855645656585693], "id": 1487, "title": "PLOG: Table-to-Logic Pretraining for Logical Table-to-Text Generation", "authors": "Ao Liu, Haoyu Dong, Naoaki Okazaki, Shi Han and Dongmei Zhang", "abstract": "Logical table-to-text generation is a task that involves generating logically faithful sentences from tables, which requires models to derive logical-level facts from table records via logical inference. It raises a new challenge on the logical-level content planning of table-to-text models. However, directly learning the logical inference knowledge from table-text pairs is very difficult for neural models because of the ambiguity of natural language and the scarcity of parallel data. Hence even large-scale pre-trained language models present low logical fidelity on logical table-to-text. In this work, we propose a Pretrained Logical Form Generator (PLOG) framework to improve generation fidelity. Specifically, PLOG is first pretrained on a table-to-logical-form generation (table-to-logic) task, then finetuned on downstream table-to-text tasks. The logical forms are formally defined with unambiguous semantics. Hence we can collect a large amount of accurate logical forms from tables without human annotation. In addition, PLOG can learn logical inference from table-logic pairs much more reliably than from table-text pairs. To evaluate our model, we further collect a controlled logical table-to-text dataset CONTLOG based on an existing dataset. On two benchmarks, LOGICNLG and CONTLOG, PLOG outperforms strong baselines by a large margin on the logical fidelity, demonstrating the effectiveness of table-to-logic pretraining.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.797091484069824, 11.815139770507812], "id": 1488, "title": "GHAN: Graph-Based Hierarchical Aggregation Network for Text-Video Retrieval", "authors": "Yahan Yu, Bojie Hu and Yu Li", "abstract": "Text-video retrieval focuses on two aspects: cross-modality interaction and video-language encoding. Currently, the mainstream approach is to train a joint embedding space for multimodal interactions. However, there are structural and semantic differences between text and video, making this approach challenging for fine-grained understanding. In order to solve this, we propose an end-to-end graph-based hierarchical aggregation network for text-video retrieval according to the hierarchy possessed by text and video. We design a token-level weighted network to refine intra-modality representations and construct a graph-based message passing attention network for global-local alignment across modality. We conduct experiments on the public datasets MSR-VTT-9K, MSR-VTT-7K and MSVD, and achieve Recall@1 of 73.0\\%, 65.6\\%, and 64.0\\% , which is 25.7\\%, 16.5\\%, and 14.2\\% better than the current state-of-the-art model.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.397482872009277, 12.171443939208984], "id": 1503, "title": "MuRAG: Multimodal Retrieval-Augmented Generator for Open Question Answering over Images and Text", "authors": "Wenhu Chen, Hexiang Hu, Xi Chen, Pat Verga and William Cohen", "abstract": "While language Models store a massive amount of world knowledge implicitly in their parameters, even very large models often fail to encode information about rare entities and events, while incurring huge computational costs. Recently, retrieval-augmented models, such as REALM, RAG, and RETRO, have incorporated world knowledge into language generation by leveraging an external non-parametric index and have demonstrated impressive performance with constrained model sizes. However, these methods are restricted to retrieving only textual knowledge, neglecting the ubiquitous amount of knowledge in other modalities like images -- much of which contains information not covered by any text. To address this limitation, we propose the first Multimodal Retrieval-Augmented Transformer (MuRAG), which accesses an external non-parametric multimodal memory to augment language generation. MuRAG is pre-trained with a mixture of large-scale image-text and text-only corpora using a joint contrastive and generative loss. We perform experiments on two different datasets that require retrieving and reasoning over both images and text to answer a given query: WebQA, and MultimodalQA. Our results show that MuRAG achieves state-of-the-art accuracy, outperforming existing models by 10-20\\% absolute on both datasets and under both distractor and full-wiki settings.", "track": "Ethic Concerns:Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [2.5010602474212646, 8.627835273742676], "id": 1507, "title": "PHEE: A Dataset for Pharmacovigilance Event Extraction from Text", "authors": "Zhaoyue Sun, Jiazheng Li, Gabriele Pergola, Byron C. Wallace, Bino John, Nigel Greene, Joseph Kim and Yulan He", "abstract": "The primary goal of drug safety researchers and regulators is to promptly identify adverse drug reactions. Doing so may in turn prevent or reduce the harm to patients and ultimately improve public health. Evaluating and monitoring drug safety (i.e., pharmacovigilance) involves analyzing an ever growing collection of spontaneous reports from health professionals, physicians, and pharmacists, and information voluntarily submitted by patients. In this scenario, facilitating analysis of such reports via automation has the potential to rapidly identify safety signals. Unfortunately, public resources for developing natural language models for this task are scant. We present PHEE, a novel dataset for pharmacovigilance comprising over 5000 annotated events from medical case reports and biomedical literature, making it the largest such public dataset to date. We describe the hierarchical event schema designed to provide coarse and fine-grained information about patients' demographics, treatments and (side) effects. Along with the discussion of the dataset, we present a thorough experimental evaluation of current state-of-the-art approaches for biomedical event extraction, point out their limitations, and highlight open challenges to foster future research in this area.", "track": "Resources and Evaluation", "label": 1}, {"loc": [7.292745590209961, 9.742696762084961], "id": 1515, "title": "OTSeq2Set: An Optimal Transport Enhanced Sequence-to-Set Model for Extreme Multi-label Text Classification", "authors": "Jie Cao and Yin Zhang", "abstract": "Extreme multi-label text classification (XMTC) is the task of finding the most relevant subset labels from an extremely large-scale label collection. Recently, some deep learning models have achieved state-of-the-art results in XMTC tasks. These models commonly predict scores for all labels by a fully connected layer as the last layer of the model. However, such models can't predict a relatively complete and variable-length label subset for each document, because they select positive labels relevant to the document by a fixed threshold or take top k labels in descending order of scores. A less popular type of deep learning models called sequence-to-sequence (Seq2Seq) focus on predicting variable-length positive labels in sequence style. However, the labels in XMTC tasks are essentially an unordered set rather than an ordered sequence, the default order of labels restrains Seq2Seq models in training. To address this limitation in Seq2Seq, we propose an autoregressive sequence-to-set model for XMTC tasks named OTSeq2Set. Our model generates predictions in student-forcing scheme and is trained by a loss function based on bipartite matching which enables permutation-invariance. Meanwhile, we use the optimal transport distance as a measurement to force the model to focus on the closest labels in semantic label space. Experiments show that OTSeq2Set outperforms other competitive baselines on 4 benchmark datasets. Especially, on the Wikipedia dataset with 31k labels, it outperforms the state-of-the-art Seq2Seq method by 16.34% in micro-F1 score. The code is available at https://github.com/caojie54/OTSeq2Set.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [10.728055953979492, 7.013880729675293], "id": 1526, "title": "SimQA: Detecting Simultaneous MT Errors through Word-by-Word Question Answering", "authors": "HyoJung Han, Marine Carpuat and Jordan Boyd-Graber", "abstract": "Detractors of neural machine translation admit that while its translations are fluent, it sometimes gets key facts wrong.\nThis is particularly important in simultaneous interpretation where translations have to be provided as fast as possible: before a sentence is complete.\nYet, evaluations of simultaneous machine translation (SimulMT) fail to capture if systems correctly translate the most salient elements of a question: people, places, and dates.\nTo address this problem, we introduce a downstream word-by-word question answering evaluation task (SimQA): given a source language question, translate the question word by word into the target language word by word, and answer as soon as possible.\nSimQA jointly measures whether the SimulMT models translates the question quickly and accurately, and can reveal shortcomings in existing neural systems\u2014hallucinating or omitting facts.", "track": "Machine Translation", "label": 10}, {"loc": [8.89405345916748, 6.106393814086914], "id": 1533, "title": "Discovering Low-rank Subspaces for Language-agnostic Multilingual Representations", "authors": "Zhihui Xie, Handong Zhao, Tong Yu and Shuai Li", "abstract": "Large pretrained multilingual language models (ML-LMs) have shown remarkable capabilities of zero-shot cross-lingual transfer, without direct cross-lingual supervision. While these results are promising, follow-up works found that, within the multilingual embedding spaces, there exists strong language identity information which hinders the expression of linguistic factors shared across languages. For semantic tasks like cross-lingual sentence retrieval, it is desired to remove such language identity signals to fully leverage semantic information. In this work, we provide a novel view of projecting away language-specific factors from a multilingual embedding space. Specifically, we discover that there exists a low-rank subspace that primarily encodes information irrelevant to semantics (e.g., syntactic information). To identify this subspace, we present a simple but effective unsupervised method based on singular value decomposition with multiple monolingual corpora as input. Once the subspace is found, we can directly project the original embeddings into the null space to boost language agnosticism without finetuning. We systematically evaluate our method on various tasks including the challenging language-agnostic QA retrieval task. Empirical results show that applying our method consistently leads to improvements over commonly used ML-LMs.", "track": "Multilinguality", "label": 13}, {"loc": [7.073671340942383, 4.818701267242432], "id": 1535, "title": "Rethinking the Authorship Verification Experimental Setups", "authors": "Florin Brad, Andrei Manolache, Elena Burceanu, Antonio Barbalau, Radu Tudor Ionescu and Marius Popescu", "abstract": "One of the main drivers of the recent advances in authorship verification is the PAN large-scale authorship dataset. Despite generating significant progress in the field, inconsistent performance differences between the closed and open test sets have been reported. To this end, we improve the experimental setup by proposing five new public splits over the PAN dataset, specifically designed to isolate and identify biases related to the text topic and to the author's writing style. We evaluate several BERT-like baselines on these splits, showing that such models are competitive with authorship verification state-of-the-art methods. Furthermore, using explainable AI, we find that these baselines are biased towards named entities. We show that models trained without the named entities obtain better results and generalize better when tested on DarkReddit, our new dataset for authorship verification.", "track": "Resources and Evaluation", "label": 1}, {"loc": [6.403482437133789, 12.260196685791016], "id": 1551, "title": "Borrowing Human Senses: Comment-Aware Self-Training for Social Media Multimodal Classification", "authors": "Chunpu Xu and Jing Li", "abstract": "Social media is daily creating massive multimedia content with paired image and text, presenting the pressing need to automate the vision and language understanding for various multimodal classification tasks. Compared to the commonly researched visual-lingual data, social media posts tend to exhibit more implicit image-text relations. To better glue the cross-modal semantics therein, we capture hinting features from user comments, which are retrieved via jointly leveraging visual and lingual similarity. Afterwards, the classification tasks are explored via self-training in a teacher-student framework, motivated by the usually limited labeled data scales in existing benchmarks. Substantial experiments are conducted on four multimodal social media benchmarks for image-text relation classification, sarcasm detection, sentiment classification, and hate speech detection. The results show that our method further advances the performance of previous state-of-the-art models, which do not employ comment modeling or self-training.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [8.798788070678711, 8.14393138885498], "id": 1552, "title": "Training Language Models with Memory Augmentation", "authors": "Zexuan Zhong, Tao Lei and Danqi Chen", "abstract": "Recent work has improved language models (LMs) remarkably by equipping them with a non-parametric memory component. However, most existing approaches only introduce mem-ories at testing time or represent them using a separately trained encoder, resulting in suboptimal training of the language model. In this work, we present TRIME, a novel yet simple training approach designed for training LMs with memory augmentation. Our approach uses a training objective that directly takes in-batch examples as accessible memory. We also present new methods for memory construction and data batching, which are used for adapting to different sets of memories\u2014local, long-term, and external memory\u2014at testing time. We evaluate TRIME on multiple language modeling and machine translation benchmarks and show that it is able to achieve significant improvements across all the settings. Concretely, TRIME reduces the perplexity from 18.70 to 15.37 on WIKITEXT-103, by effectively leveraging a large memory set from the training corpus. Compared to standard LM training, TRIME adds negligible computational overhead and is compatible with different neural architectures, making it a versatile solution for training memory-augmented LMs.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [7.589925765991211, 12.355353355407715], "id": 1560, "title": "Data-Efficient Strategies for Expanding Hate Speech Detection into Under-Resourced Languages", "authors": "Paul R\u00f6ttger, Debora Nozza, Federico Bianchi and Dirk Hovy", "abstract": "Hate speech is a global phenomenon, but most hate speech datasets so far focus on English-language content. This hinders the development of more effective hate speech detection models in hundreds of languages spoken by billions across the world. More data is needed, but annotating hateful content is expensive, time-consuming and potentially harmful to annotators. To mitigate these issues, we explore data-efficient strategies for expanding hate speech detection into under-resourced languages. In a series of experiments with mono- and multilingual models across five non-English languages, we find that 1) a small amount of target-language fine-tuning data is needed to achieve strong performance, 2) the benefits of using more such data decrease exponentially, and 3) initial fine-tuning on readily-available English data can partially substitute target-language data and improve model generalisability. Based on these findings, we formulate actionable recommendations for hate speech detection in low-resource language settings.", "track": "Multilinguality", "label": 13}, {"loc": [1.7521641254425049, 3.8399999141693115], "id": 1570, "title": "Dimension Reduction for Efficient Dense Retrieval via Conditional Autoencoder", "authors": "Zhenghao Liu, Han Zhang, Chenyan Xiong, Zhiyuan Liu, Yu Gu and Xiaohua Li", "abstract": "Dense retrievers encode queries and documents and map them in an embedding space using pre-trained language models. These embeddings need to be high-dimensional to fit training signals and guarantee the retrieval effectiveness of dense retrievers. However, these high-dimensional embeddings lead to larger index storage and higher retrieval latency. To reduce the embedding dimensions of dense retrieval, this paper proposes a Conditional Autoencoder (ConAE) to compress the high-dimensional embeddings to maintain the same embedding distribution and better recover the ranking features. Our experiments show that ConAE is effective in compressing embeddings by achieving comparable ranking performance with its teacher model and making the retrieval system more efficient. Our further analyses show that ConAE can alleviate the redundancy of the embeddings of dense retrieval with only one linear layer. All codes of this work are available at https://github.com/NEUIR/ConAE.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [3.7965452671051025, 9.869338989257812], "id": 1584, "title": "Controlled Text Reduction", "authors": "Aviv Slobodkin, Paul Roit, Eran Hirsch, Ori Ernst and Ido Dagan", "abstract": "Producing a reduced version of a source text, as in generic or focused summarization, inherently involves two distinct subtasks: deciding on targeted content and generating a coherent text conveying it. While some popular approaches address summarization as a single end-to-end task, prominent works support decomposed modeling for individual subtasks. \nFurther, semi-automated text reduction is also very appealing, where users may identify targeted content while models would generate a corresponding coherent summary.\nIn this paper, we focus on the second subtask, of generating coherent text given pre-selected content. Concretely, we formalize \\textit{Controlled Text Reduction} as a standalone task, whose input is a source text with marked spans of targeted content (\"highlighting\").\nA model then needs to generate a coherent text that includes all and only the target information.\nWe advocate the potential of such models, both for modular fully-automatic summarization, as well as for semi-automated human-in-the-loop use cases.\nFacilitating proper research, we crowdsource high-quality dev and test datasets for the task. Further, we automatically generate a larger \"silver\" training dataset from available summarization benchmarks, leveraging a pretrained summary-source alignment model.\nFinally, employing these datasets, we present a supervised baseline model, showing promising results and insightful analyses.", "track": "Natural Language Generation", "label": 6}, {"loc": [3.7452144622802734, 9.577372550964355], "id": 1589, "title": "Questioning the Validity of Summarization Datasets and Improving Their Factual Consistency", "authors": "Yanzhu Guo, Chlo\u00e9 Clavel, Moussa Kamal Eddine and michalis vazirgiannis", "abstract": "The topic of summarization evaluation has recently attracted a surge of attention due to the rapid development of abstractive summarization systems. However, the formulation of the task is rather ambiguous, neither the linguistic nor the natural language processing communities have succeeded in giving a mutually agreed-upon definition. Due to this lack of well-defined formulation, a large number of popular abstractive summarization datasets are constructed in a manner that neither guarantees validity nor meets one of the most essential criteria of summarization: factual consistency. In this paper, we address this issue by combining state-of-the-art factual consistency models to identify the problematic instances present in popular summarization datasets. We release SummFC, a filtered summarization dataset with improved factual consistency, and demonstrate that models trained on this dataset achieve improved performance in nearly all quality aspects. We argue that our dataset should become a valid benchmark for developing and evaluating summarization systems.", "track": "Theme Track", "label": 18}, {"loc": [8.130956649780273, 8.082945823669434], "id": 1591, "title": "Invariant Language Modeling", "authors": "Maxime Peyrard, Sarvjeet Singh Ghotra, Martin Josifoski, Vidhan Agarwal, Barun Patra, Dean Carignan, Emre Kiciman, Saurabh Tiwary and Robert West", "abstract": "Modern pretrained language models are critical components of NLP pipelines. \nYet, they suffer from spurious correlations, poor out-of-domain generalization, and biases.\nInspired by recent progress in causal machine learning, in particular the invariant risk minimization (IRM) paradigm, we propose invariant language modeling, a framework for learning invariant representations that generalize better across multiple environments. In particular, we adapt a game-theoretic implementation of IRM (IRM-games) to language models, where the invariance emerges from a specific training schedule in which all the environments compete to optimize their own environment-specific loss by updating subsets of the model in a round-robin fashion.\nWe focused on controlled experiments to precisely demonstrate the ability of our method to (i) remove structured noise, (ii) ignore specific spurious correlations without affecting global performance, and (iii) achieve better out-of-domain generalization.\nThese benefits come with a negligible computational overhead compared to standard training, do not require changing the local loss, and can be applied to any language model. We believe this framework is promising to help mitigate spurious correlations and biases in language models.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [8.700105667114258, 8.321676254272461], "id": 1609, "title": "AdaMix: Mixture-of-Adaptations for Parameter-efficient Model Tuning", "authors": "Yaqing Wang, Sahaj Agarwal, Subhabrata Mukherjee, Xiaodong Liu, Jing Gao, Ahmed Hassan Awadallah and Jianfeng Gao", "abstract": "Standard fine-tuning of large pre-trained language models (PLMs) for downstream tasks requires updating hundreds of millions to billions of parameters, and storing a large copy of the PLM weights for every task resulting in increased cost for storing, sharing and serving the models. To address this, parameter-efficient fine-tuning (PEFT) techniques were introduced where small trainable components are injected in the PLM and updated during fine-tuning. We propose AdaMix as a general PEFT method that tunes a mixture of adaptation modules -- given the underlying PEFT method of choice -- introduced in each Transformer layer while keeping most of the PLM weights frozen. For instance, AdaMix can leverage a mixture of adapters like Houlsby or a mixture of low rank decomposition matrices like LoRA to improve downstream task performance over the corresponding PEFT methods for fully supervised and few-shot NLU and NLG tasks. Further, we design AdaMix such that it matches the same computational cost and the number of tunable parameters as the underlying PEFT method. By only tuning 0.1-0.2% of PLM parameters, we show that AdaMix outperforms SOTA parameter-efficient fine-tuning and full model fine-tuning for both NLU and NLG tasks.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [3.728872299194336, 9.557570457458496], "id": 1617, "title": "How \"Multi\" is Multi-Document Summarization?", "authors": "Ruben Wolhandler, Arie Cattan, Ori Ernst and Ido Dagan", "abstract": "The task of multi-document summarization (MDS) aims at models that, given multiple documents as input, are able to generate a summary that combines disperse information, originally spread __across__ these documents. Accordingly, it is expected that both reference summaries in MDS datasets, as well as system summaries, would indeed be based on such dispersed information. In this paper, we argue for quantifying and assessing this expectation. To that end, we propose an automated measure for evaluating the degree to which a summary is ``disperse'', in the sense of the number of source documents needed to cover its content. We apply our measure to empirically analyze several popular MDS datasets, with respect to their reference summaries, as well as the output of state-of-the-art systems. Our results show that certain MDS datasets barely require combining information from multiple documents, where a single document often covers the full summary content. Overall, we advocate using our metric for assessing and improving the degree to which summarization datasets require combining multi-document information, and similarly how summarization models actually meet this challenge.", "track": "Summarization", "label": 14}, {"loc": [2.5044491291046143, 8.651063919067383], "id": 1620, "title": "BioReader: a Retrieval-Enhanced Text-to-Text Transformer for Biomedical Literature", "authors": "Giacomo Frisoni, Miki Mizutani, Gianluca Moro and Lorenzo Valgimigli", "abstract": "The latest batch of research has equipped language models with the ability to attend over relevant and factual information from non-parametric external sources, drawing a complementary path to architectural scaling. Besides mastering language, exploiting and contextualizing the latent world knowledge is crucial in complex domains like biomedicine. However, most works in the field rely on general-purpose models supported by databases like Wikipedia and Books. We introduce BioReader, the first retrieval-enhanced text-to-text model for biomedical natural language processing. Our domain-specific T5-based solution augments the input prompt by fetching and assembling relevant scientific literature chunks from a neural database with \u224860 million tokens centered on PubMed. We fine-tune and evaluate BioReader on a broad array of downstream tasks, significantly outperforming several state-of-the-art methods despite using up to 3x fewer parameters. In tandem with extensive ablation studies, we show that domain knowledge can be easily altered or supplemented to make the model generate correct predictions bypassing the retraining step and thus addressing the literature overload issue.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [10.06582260131836, 7.613828182220459], "id": 1626, "title": "T-Modules: Translation Modules for Zero-Shot Cross-Modal Machine Translation", "authors": "Paul-Ambroise Duquenne, Hongyu Gong, Beno\u00eet Sagot and Holger Schwenk", "abstract": "We present a new approach to perform zero-shot cross-modal transfer between speech and text for translation tasks. Multilingual speech and text are encoded in a joint fixed-size representation space. Then, we compare different approaches to decode these multimodal and multilingual fixed-size representations, enabling zero-shot translation between languages and modalities. All our models are trained without the need of cross-modal labeled translation data.\nDespite a fixed-size representation, we achieve very competitive results on several text and speech translation tasks. In particular, we significantly improve the state-of-the-art for zero-shot speech translation on Must-C. Incorporating a speech decoder in our framework, we introduce the first results for zero-shot direct speech-to-speech and text-to-speech translation.", "track": "Multilinguality", "label": 13}, {"loc": [3.9833481311798096, 4.02597188949585], "id": 1638, "title": "LILA: A Unified Benchmark for Mathematical Reasoning", "authors": "Swaroop Mishra, Matthew Finlayson, Pan Lu, Leonard Tang, Sean Welleck, Chitta Baral, Tanmay Rajpurohit, Oyvind Tafjord, Ashish Sabharwal, Peter Clark and Ashwin K. Kalyan", "abstract": "Mathematical reasoning skills are essential for general-purpose intelligent\nsystems to perform tasks from grocery shopping to climate modeling.\nTowards evaluating and improving AI systems in this domain, we propose\nLILA, a unified mathematical reasoning benchmark consisting of 23 diverse\ntasks along four dimensions:\n(i) mathematical abilities e.g., arithmetic, calculus \n(ii) language format e.g., question-answering, fill-in-the-blanks \n(iii) language diversity e.g., no language, simple language \n(iv) external knowledge e.g., commonsense, physics. \nWe construct our benchmark by extending 20 datasets benchmark \nby collecting task instructions and solutions in the form of Python programs,\nthereby obtaining explainable solutions \nin addition to the correct answer.\nWe additionally introduce two evaluation datasets \nto measure out-of-distribution performance \nand robustness to language perturbation.\nFinally, we introduce BHASKARA,\na general-purpose mathematical reasoning model trained on LILA. \nImportantly, we find that multi-tasking leads to significant improvements \n(average relative improvement of 21.83% F1 score vs. single-task models),\nwhile the best performing model only obtains 60.40%,\nindicating the room for improvement \nin general mathematical reasoning and understanding.", "track": "Resources and Evaluation", "label": 1}, {"loc": [4.398414134979248, 4.489282131195068], "id": 1639, "title": "Leveraging Affirmative Interpretations from Negation Improves Natural Language Understanding", "authors": "Md Mosharaf Hossain and Eduardo Blanco", "abstract": "Negation poses a challenge in many natural language understanding tasks. Inspired by the fact that understanding a negated statement often requires humans to infer affirmative interpretations, in this paper we show that doing so benefits models for three natural language understanding tasks. We present an automated procedure to collect pairs of sentences with negation and their affirmative interpretations, resulting in over 150,000 pairs. Experimental results show that leveraging these pairs helps (a) T5 generate affirmative interpretations from negations in a previous benchmark, and (b) a RoBERTa-based classifier solve the task of natural language inference. We also leverage our pairs to build a plug-and-play neural generator that given a negated statement generates an affirmative interpretation. Then, we incorporate the pretrained generator into a RoBERTa-based classifier for sentiment analysis and show that doing so improves the results. Crucially, our proposal does not require any manual effort.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [1.6336257457733154, 5.329700946807861], "id": 1648, "title": "GraphQ IR: Unifying the Semantic Parsing of Graph Query Languages with One Intermediate Representation", "authors": "Lunyiu Nie, Shulin Cao, Jiaxin Shi, Jiuding Sun, Qi Tian, Lei Hou, Juanzi Li and Jidong Zhai", "abstract": "Subject to the huge semantic gap between natural and formal languages, neural semantic parsing is typically bottlenecked by its complexity of dealing with both input semantics and output syntax. Recent works have proposed several forms of supplementary supervision but none is generalized across multiple formal languages. This paper proposes a unified intermediate representation for graph query languages, named GraphQ IR. It has a natural-language-like expression that bridges the semantic gap and formally defined syntax that maintains the graph structure. Therefore, a neural semantic parser can more precisely convert user queries into GraphQ IR, which can be later losslessly compiled into various downstream graph query languages. Extensive experiments on several benchmarks including KQA Pro, Overnight, GrailQA, and MetaQA-Cypher under the standard i.i.d., out-of-distribution, and low-resource settings validate GraphQ IR's superiority over the previous state-of-the-arts with a maximum 11% accuracy improvement.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [8.007452011108398, 7.692874431610107], "id": 1660, "title": "InforMask: Unsupervised Informative Masking for Language Model Pretraining", "authors": "Nafis Sadeq, Canwen Xu and Julian McAuley", "abstract": "Masked language modeling is widely used for pretraining large language models for natural language understanding (NLU). However, random masking is suboptimal, allocating an equal masking rate for all tokens. In this paper, we propose InforMask, a new unsupervised masking strategy for training masked language models. InforMask exploits Pointwise Mutual Information (PMI) to select the most informative tokens to mask. We further propose two optimizations for InforMask to improve its efficiency. With a one-off preprocessing step, InforMask outperforms random masking and previously proposed masking strategies on the factual recall benchmark LAMA and the question answering benchmark SQuAD v1 and v2.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [3.666712999343872, 9.76048755645752], "id": 1662, "title": "CTRLsum: Towards Generic Controllable Text Summarization", "authors": "Junxian He, Wojciech Kryscinski, Bryan McCann, Nazneen Rajani and Caiming Xiong", "abstract": "Current summarization systems yield generic summaries that are disconnected from users' preferences and expectations. To address this limitation, we present CTRLsum, a generic framework to control generated summaries through a set of keywords. During training keywords are extracted automatically without requiring additional human annotations. At test time CTRLsum features a control function to map control signal to keywords; through engineering the control function, the same trained model is able to be applied to control summaries on various dimensions, while neither affecting the model training process nor the pretrained models. We additionally explore the combination of keywords and text prompts for more control tasks. Experiments demonstrate the effectiveness of CTRLsum on three domains of summarization datasets and five control tasks: (1) entity-centric and (2) length-controllable summarization, (3) contribution summarization on scientific papers, (4) invention purpose summarization on patent filings, and (5) question-guided summarization on news articles. Moreover, when used in a standard, unconstrained summarization setting, CTRLsum is comparable or better than strong pretrained systems.", "track": "Summarization", "label": 14}, {"loc": [4.49074125289917, 5.381772041320801], "id": 1672, "title": "Missing Counter-Evidence Renders NLP Fact-Checking Unrealistic for Misinformation", "authors": "Max Glockner, Yufang Hou and Iryna Gurevych", "abstract": "Misinformation emerges in times of uncertainty when credible information is limited. This is challenging for NLP-based fact-checking as it relies on counter-evidence, which may not yet be available. Despite increasing interest in automatic fact-checking, it is still unclear if automated approaches can realistically refute harmful real-world misinformation. Here, we contrast and compare NLP fact-checking with how professional fact-checkers combat misinformation in the absence of counter-evidence. In our analysis, we show that, by design, existing NLP task definitions for fact-checking cannot refute misinformation as professional fact-checkers do for the majority of claims. We then define two requirements that the evidence in datasets must fulfill for realistic fact-checking: It must be (1) sufficient to refute the claim and (2) not leaked from existing fact-checking articles. We survey existing fact-checking datasets and find that all of them fail to satisfy both criteria. Finally, we perform experiments to demonstrate that models trained on a large-scale fact-checking dataset rely on leaked evidence, which makes them unsuitable in real-world scenarios. Taken together, we show that current NLP fact-checking cannot realistically combat real-world misinformation because it depends on unrealistic assumptions about counter-evidence in the data.", "track": "Theme Track", "label": 18}, {"loc": [0.6077322959899902, 6.96621561050415], "id": 1687, "title": "A Framework for Adapting Pre-Trained Language Models to Knowledge Graph Completion", "authors": "Justin Lovelace and Carolyn Ros\u00e9", "abstract": "Recent work has demonstrated that entity representations can be extracted from pre-trained language models to develop knowledge graph completion models that are more robust to the naturally occurring sparsity found in knowledge graphs. In this work, we conduct a comprehensive exploration of how to best extract and incorporate those embeddings into knowledge graph completion models. We explore the suitability of the extracted embeddings for direct use in entity ranking and introduce both unsupervised and supervised processing methods that can lead to improved downstream performance. We then introduce supervised embedding extraction methods that can extract more informative representations. We then synthesize our findings and develop a knowledge graph completion model that significantly outperforms recent neural models.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [3.8418712615966797, 9.879581451416016], "id": 1689, "title": "Mutual Information Alleviates Hallucinations in Abstractive Summarization", "authors": "Liam van der Poel, Clara Meister and Ryan Cotterell", "abstract": "Despite significant progress in the quality of language generated from abstractive summarization models, these models still exhibit the tendency to hallucinate, i.e., output content not supported by the source document. A number of works have tried to fix---or at least uncover the source of---the problem with limited success. In this paper, we identify a simple criterion under which models are significantly more likely to assign more probability to hallucinated content during generation: high model uncertainty. This finding offers a potential explanation for hallucinations: models default to favoring text with high marginal probability, i.e., high-frequency occurrences in the training set, when uncertain about a continuation. It also motivates possible routes for real-time intervention during decoding to prevent such hallucinations. We propose a decoding strategy that switches to optimizing for pointwise mutual information of the source and target token---rather than purely the probability of the target token---when the model exhibits uncertainty. Experiments on the \\xsum dataset show that our method decreases the probability of hallucinated tokens while maintaining the Rouge and BERT-S scores of top-performing decoding strategies.", "track": "Summarization", "label": 14}, {"loc": [9.515807151794434, 6.3575286865234375], "id": 1691, "title": "Toward the Limitation of Code-Switching in Cross-Lingual Transfer", "authors": "Yukun Feng, Feng Li and Philipp Koehn", "abstract": "Multilingual pretrained models have shown strong cross-lingual transfer ability. Some works used code-switching sentences, which consist of tokens from multiple languages, to enhance the cross-lingual representation further, and have shown success in many zero-shot cross-lingual tasks. However, code-switched tokens are likely to cause grammatical incoherence in newly substituted sentences, and negatively affect the performance on token-sensitive tasks, such as Part-of-Speech (POS) tagging and Named-Entity-Recognition (NER). This paper mitigates the limitation of the code-switching method by not only making the token replacement but considering the similarity between the context and the switched tokens so that the newly substituted sentences are grammatically consistent during both training and inference. We conduct experiments on cross-lingual POS and NER over 30+ languages, and demonstrate the effectiveness of our method by outperforming the mBERT by 0.95 and original code-switching method by 1.67 on F1 scores.", "track": "Multilinguality", "label": 13}, {"loc": [1.0958553552627563, 7.9238362312316895], "id": 1700, "title": "Syntactically Rich Discriminative Training: An Effective Method for Open Information Extraction", "authors": "Frank Mtumbuka and Thomas Lukasiewicz", "abstract": "Open information extraction (OIE) is the task of extracting facts \"(Subject, Relation, Object)\u201d from natural language text. We propose several new methods for training neural OIE models in this paper. First, we propose a novel method for computing syntactically rich text embeddings using the structure of dependency trees. Second, we propose a new discriminative training approach to OIE in which tokens in the generated fact are classified as \"real\u201d or \"fake\u201d, i.e., those tokens that are in both the generated and gold tuples, and those that are only in the generated tuple but not in the gold tuple. We also address the issue of repetitive tokens in generated facts and improve the models' ability to generate implicit facts. Our approach reduces repetitive tokens by a factor of 23%. Finally, we present paraphrased versions of the CaRB, OIE2016, and LSOIE datasets, and show that the models' performance substantially improves when trained on augmented datasets. Our best model beats the SOTA of IMoJIE on the recent CaRB dataset, with an improvement of 39.63% in F1 score.", "track": "Information Extraction", "label": 5}, {"loc": [0.7339483499526978, 7.455346584320068], "id": 1709, "title": "Transformer-based Entity Typing in Knowledge Graphs", "authors": "Zhiwei Hu, Victor Gutierrez-Basulto, Zhiliang Xiang, Ru Li and Jeff Z. Pan", "abstract": "We investigate the knowledge graph entity typing task which aims at inferring plausible entity types. In this paper, we propose a novel Transformer-based Entity Typing (TET) approach, effectively encoding the content of neighbours of an entity by means of a transformer mechanism. More precisely, TET is composed of three different mechanisms: a local transformer allowing to infer missing entity types by independently encoding the information provided by each of its neighbours; a global transformer aggregating the information of all neighbours of an entity into a single long sequence to reason about more complex entity types; and a context transformer integrating neighbours content in a differentiated way through information exchange between neighbour pairs, while preserving the graph structure. Furthermore, TET uses information about class membership of types to semantically strengthen the representation of an entity. Experiments on two real-world datasets demonstrate the superior performance of TET compared to the state-of-the-art.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.487758159637451, 5.519023418426514], "id": 1734, "title": "NewsClaims: A New Benchmark for Claim Detection from News with Attribute Knowledge", "authors": "Revanth Gangi Reddy, Sai Chetan Chinthakindi, Zhenhailong Wang, Yi Fung, Kathryn Conger, Ahmed ELsayed, Martha Palmer, Preslav Nakov, Eduard Hovy, Kevin Small and Heng Ji", "abstract": "Claim detection and verification are crucial for news understanding and have emerged as promising technologies for mitigating misinformation and disinformation in the news. However, most existing work has focused on claim sentence analysis while overlooking additional crucial attributes (e.g., the claimer and the main object associated with the claim).In this work, we present NewsClaims, a new benchmark for attribute-aware claim detection in the news domain. We extend the claim detection problem to include extraction of additional attributes related to each claim and release 889 claims annotated over 143 news articles. NewsClaims aims to benchmark claim detection systems in emerging scenarios, comprising unseen topics with little or no training data. To this end, we see that zero-shot and prompt-based baselines show promising performance on this benchmark, while still considerably behind human performance.", "track": "Ethic Concerns:Resources and Evaluation", "label": 1}, {"loc": [9.233291625976562, 6.261379241943359], "id": 1756, "title": "IsoVec: Controlling the Relative Isomorphism of Word Embedding Spaces", "authors": "Kelly Marchisio, Neha Verma, Kevin Duh and Philipp Koehn", "abstract": "The ability to extract high-quality translation dictionaries from monolingual word embedding spaces depends critically on the geometric similarity of the spaces---their degree of \"isomorphism.\" We address the root-cause of faulty cross-lingual mapping: that word embedding training resulted in the underlying spaces being non-isomorphic. \nWe incorporate global measures of isomorphism directly into the skipgram loss function, successfully increasing the relative isomorphism of trained word embedding spaces and improving their ability to be mapped to a shared cross-lingual space. The result is improved bilingual lexicon induction in general data conditions, under domain mismatch, and with training algorithm dissimilarities. \nWe release IsoVec at https://github.com/kellymarchisio/isovec.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [7.721404075622559, 3.55267071723938], "id": 1772, "title": "Adversarial Concept Erasure in Kernel Space", "authors": "Shauli Ravfogel, Francisco Vargas\u202c, Yoav Goldberg and Ryan Cotterell", "abstract": "The representation space of neural models for textual data emerges in an unsupervised manner during training. Understanding how human-interpretable concepts, such as gender, are encoded in these representations would improve the ability of users to control the content of these representations and analyze the working of the models that rely on them. One prominent approach to the control problem is the identification and removal of linear concept subspaces -- subspaces in the representation space that correspond to a given concept. While those are tractable and interpretable, neural network do not necessarily represent concepts in linear subspaces. \n\nWe propose a kernelization of the recently-proposed linear concept-removal objective, and show that it is effective in guarding against the ability of certain nonlinear adversaries to recover the concept. Interestingly, our findings suggest that the division between linear and nonlinear models is overly simplistic: when considering the concept of binary gender and its neutralization, we do not find a single kernel space that exclusively contains all the concept-related information. It is therefore challenging to protect against all nonlinear adversaries at once.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.5214009284973145, 8.466492652893066], "id": 1784, "title": "The Authenticity Gap in Human Evaluation", "authors": "Kawin Ethayarajh and Dan Jurafsky", "abstract": "Human ratings are the gold standard in NLG evaluation. The standard protocol is to collect ratings of generated text, average across annotators, and rank NLG systems by their average scores. However, little consideration has been given as to whether this approach faithfully captures human preferences. Analyzing this standard protocol through the lens of utility theory in economics, we identify the implicit assumptions it makes about annotators. These assumptions are often violated in practice, in which case annotator ratings cease to reflect their preferences. The most egregious violations come from using Likert scales, which provably reverse the direction of the true preference in certain cases. We suggest improvements to the standard protocol to make it more theoretically sound, but even in its improved form, it cannot be used to evaluate open-ended tasks like story generation. For the latter, we propose a new human evaluation protocol called system-level probabilistic assessment (SPA). When human evaluation of stories is done with SPA, we can recover the ordering of GPT-3 models by size, with statistically significant results. However, when human evaluation is done with the standard protocol, less than half of the expected preferences can be recovered (e.g., there is no significant difference between curie and davinci, despite using a highly powered test).", "track": "Theme Track", "label": 18}, {"loc": [6.422831058502197, 5.934441089630127], "id": 1790, "title": "BERT in Plutarch's Shadows", "authors": "Ivan Yamshchikov, Alexey Tikhonov, Yorgos Pantis, Charlotte Schubert and J\u00fcrgen Jost", "abstract": "The extensive surviving corpus of the ancient scholar Plutarch of Chaeronea (ca. 45-120 CE) also contains several texts which, according to current scholarly opinion, did not originate with him and are therefore attributed to an anonymous author Pseudo-Plutarch. These include, in particular, the work Placita Philosophorum (Quotations and Opinions of the Ancient Philosophers), which is extremely important for the history of ancient philosophy. Little is known about the identity of that anonymous author and its relation to other authors from the same period. This paper presents a BERT language model for Ancient Greek. The model discovers previously unknown statistical properties relevant to these literary, philosophical, and historical problems and can shed new light on this authorship question. In particular, the Placita Philosophorum, together with one of the other Pseudo-Plutarch texts, shows similarities with the texts written by authors from an Alexandrian context (2nd/3rd century CE).", "track": "NLP Applications", "label": 0}, {"loc": [3.834892511367798, 9.974461555480957], "id": 1798, "title": "Leveraging Locality in Abstractive Text Summarization", "authors": "Yixin Liu, Ansong Ni, Linyong Nan, Budhaditya Deb, Chenguang Zhu, Ahmed Hassan Awadallah and Dragomir Radev", "abstract": "Neural attention models have achieved significant improvements on many natural language processing tasks. However, the quadratic memory complexity of the self-attention module with respect to the input length hinders their applications in long text summarization. Instead of designing more efficient attention modules, we approach this problem by investigating if models with a restricted context can have competitive performance compared with the memory-efficient attention models that maintain a global context by treating the input as a single sequence. Our model is applied to individual pages, which contain parts of inputs grouped by the principle of locality, during both the encoding and decoding stages. We empirically investigated three kinds of locality in text summarization at different levels of granularity, ranging from sentences to documents. Our experimental results show that our model has a better performance compared with strong baseline models with efficient attention modules, and our analysis provides further insights into our locality-aware modeling strategy.", "track": "Summarization", "label": 14}, {"loc": [3.7552075386047363, 9.785235404968262], "id": 1804, "title": "Salience Allocation as Guidance for Abstractive Summarization", "authors": "Fei Wang, Kaiqiang Song, Hongming Zhang, Lifeng Jin, Sangwoo Cho, Wenlin Yao, Xiaoyang Wang, Muhao Chen and Dong Yu", "abstract": "Abstractive summarization models typically learn to capture the salient information from scratch implicitly.\nRecent literature adds extractive summaries as guidance for abstractive summarization models to provide hints of salient content and achieves better performance.\nHowever, extractive summaries as guidance could be over strict, leading to information loss or noisy signals.\nFurthermore, it cannot easily adapt to documents with various abstractiveness.\nAs the number and allocation of salience content pieces varies, it is hard to find a fixed threshold deciding which content should be included in the guidance.\nIn this paper, we propose a novel summarization approach with a flexible and reliable salience guidance, namely SEASON (SaliencE Allocation as Guidance for Abstractive SummarizatiON).\nSEASON utilizes the allocation of salience expectation to guide abstractive summarization and adapts well to articles in different abstractiveness.\nAutomatic and human evaluations on two benchmark datasets show that the proposed method is effective and reliable.\nEmpirical results on more than one million news articles demonstrate a natural fifteen-fifty salience split for news article sentences, providing a useful insight for composing news articles.", "track": "Summarization", "label": 14}, {"loc": [7.787707805633545, 8.1945219039917], "id": 1807, "title": "Fine-tuned Language Models are Continual Learners", "authors": "Thomas Scialom, Tuhin Chakrabarty and Smaranda Muresan", "abstract": "Recent work on large language models relies on the intuition that most natural language processing tasks can be described via natural language instructions and that models trained on these instructions show strong zero-shot performance on several standard datasets. However, these models even though impressive still perform poorly on a wide range of tasks outside of their respective training and evaluation sets.To address this limitation, we argue that a model should be able to keep extending its knowledge and abilities, without forgetting previous skills. In spite of the limited success of Continual Learning, we show that \\emph{Fine-tuned Language Models can be continual learners}.We empirically investigate the reason for this success and conclude that Continual Learning emerges from self-supervision pre-training. Our resulting model Continual-T0 (CT0) is able to learn 8 new diverse language generation tasks, while still maintaining good performance on previous tasks, spanning in total of 70 datasets. Finally, we show that CT0 is able to combine instructions in ways it was never trained for, demonstrating some level of instruction compositionality.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [4.8568220138549805, 4.208720684051514], "id": 1814, "title": "Natural Logic-guided Autoregressive Multi-hop Document Retrieval for Fact Verification", "authors": "Rami Aly and Andreas Vlachos", "abstract": "A key component of fact verification is the evidence retrieval, often from multiple documents. Recent approaches use dense representations and condition the retrieval of each document on the previously retrieved ones. The latter step is performed over all the documents in the collection, requiring storing their dense representations in an index, thus incurring a high memory footprint. An alternative paradigm is retrieve-and-rerank, where documents are retrieved using methods such as BM25, their sentences are reranked, and further documents are retrieved conditioned on these sentences, reducing the memory requirements. However, such approaches can be brittle as they rely on heuristics and assume hyperlinks between documents.\n\nWe propose a novel retrieve-and-rerank method for multi-hop retrieval, that consists of a retriever that \njointly scores documents in the knowledge source and sentences from previously retrieved documents using an autoregressive formulation and is guided by a proof system based on natural logic that dynamically terminates the retrieval process if the evidence is deemed sufficient.\n\nThis method exceeds or is on par with the current state-of-the-art on FEVER, HoVer and FEVEROUS-S, \nwhile using 5 to 10 times less memory than competing systems. Evaluation on an adversarial dataset indicates improved stability of our approach compared to commonly deployed threshold-based methods. Finally, the proof system helps humans predict model decisions correctly more often than using the evidence alone.", "track": "NLP Applications", "label": 0}, {"loc": [1.048694133758545, 10.530959129333496], "id": 1817, "title": "AX-MABSA: A Framework for Extremely Weakly Supervised Multi-label Aspect Based Sentiment Analysis", "authors": "Sabyasachi Kamila, Walid Magdy, Sourav Dutta and MingXue Wang", "abstract": "Aspect Based Sentiment Analysis is a dominant research area with potential applications in social media analytics, business, finance, and health. Prior works in this area are primarily based on supervised methods, with a few techniques using weak supervision limited to predicting a single aspect category per review sentence. In this paper, we present an extremely weakly supervised multi-label Aspect Category Sentiment Analysis framework which does not use any labelled data. We only rely on a single word per class as an initial indicative information. We further propose an automatic word selection technique to choose these seed categories and sentiment words. We explore unsupervised language model post-training to improve the overall performance, and propose a multi-label generator model to generate multiple aspect category-sentiment pairs per review sentence. Experiments conducted on four benchmark datasets showcase our method to outperform other weakly supervised baselines by a significant margin.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [5.7261576652526855, 10.876175880432129], "id": 1820, "title": "Transfer Learning with Synthetic Corpora for Spatial Role Labeling and Reasoning", "authors": "Roshanak Mirzaee and Parisa Kordjamshidi", "abstract": "Recent research shows synthetic data as a source of supervision helps pretrained language models (PLM) transfer learning to new target tasks/domains. However, this idea is less explored for spatial language. We provide two new data resources on multiple spatial language processing tasks. The first dataset is synthesized for transfer learning on spatial question answering (SQA) and spatial role labeling (SpRL). Compared to previous SQA datasets, we include a larger variety of spatial relation types and spatial expressions. Our data generation process is easily extendable with new spatial expression lexicons. The second one is a real-world SQA dataset with human-generated questions built on an existing corpus with SPRL annotations. This dataset can be used to evaluate spatial language processing models in realistic situations. We show pretraining with automatically generated data significantly improves the SOTA results on several SQA and SPRL benchmarks, particularly when the training data in the target domain is small.", "track": "Resources and Evaluation", "label": 1}, {"loc": [7.927734851837158, 9.166131973266602], "id": 1830, "title": "A Survey of Active Learning for Natural Language Processing", "authors": "Zhisong Zhang, Emma Strubell and Eduard Hovy", "abstract": "In this work, we provide a literature review of active learning (AL) for its applications in natural language processing (NLP). In addition to a fine-grained categorization of query strategies, we also investigate several other important aspects of applying AL to NLP problems. These include AL for structured prediction tasks, annotation cost, model learning (especially with deep neural models), and starting and stopping AL. Finally, we conclude with a discussion of related topics and future directions.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [9.09914493560791, 6.487714767456055], "id": 1840, "title": "Bernice: A Multilingual Pre-trained Encoder for Twitter", "authors": "Alexandra DeLucia, Shijie Wu, Aaron Mueller, Carlos Aguirre, Philip Resnik and Mark Dredze", "abstract": "The language of Twitter differs significantly from that of other domains commonly included in large language model training. \nWhile tweets are typically multilingual and contain informal language, including emoji and hashtags, most pre-trained language models for Twitter are either monolingual, adapted from other domains rather than trained exclusively on Twitter, or are trained on a limited amount of in-domain Twitter data.\nWe introduce Bernice, the first multilingual RoBERTa language model trained from scratch on 2.5 billion tweets with a custom tweet-focused tokenizer. \nWe evaluate on a variety of monolingual and multilingual Twitter benchmarks, finding that our model consistently exceeds or matches the performance of a variety of models adapted to social media data as well as strong multilingual baselines, despite being trained on less data overall.\nWe posit that it is more efficient compute- and data-wise to train completely on in-domain data with a specialized domain-specific tokenizer.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [6.0577192306518555, 8.318950653076172], "id": 1851, "title": "CEFR-Based Sentence Difficulty Annotation and Assessment", "authors": "Yuki Arase, Satoru Uchida and Tomoyuki Kajiwara", "abstract": "Controllable text simplification is a crucial assistive technique for language learning and teaching. \nOne of the primary factors hindering its advancement is the lack of a corpus annotated with sentence difficulty levels based on language ability descriptions. \nTo address this problem, we created the CEFR-based Sentence Profile (CEFR-SP) corpus, containing 17k English sentences annotated with the levels based on the Common European Framework of Reference for Languages assigned by English-education professionals. \nIn addition, we propose a sentence-level assessment model to handle unbalanced level distribution because the most basic and highly proficient sentences are naturally scarce. \nIn the experiments in this study, our method achieved a macro-F1 score of 84.5% in the level assessment, thus outperforming strong baselines employed in readability assessment.", "track": "Resources and Evaluation", "label": 1}, {"loc": [1.7825573682785034, 8.780869483947754], "id": 1857, "title": "Simple Questions Generate Named Entity Recognition Datasets", "authors": "Hyunjae Kim, jaehyo yoo, Seunghyun Yoon, Jinhyuk Lee and Jaewoo Kang", "abstract": "Recent named entity recognition (NER) models often rely on human-annotated datasets requiring the vast engagement of professional knowledge on the target domain and entities. This work introduces an ask-to-generate approach, which automatically generates NER datasets by asking simple natural language questions to an open-domain question answering system (e.g., \"Which disease?\"). Despite using fewer training resources, our models solely trained on the generated datasets largely outperform strong low-resource models by 19.5 F1 score across six popular NER benchmarks. Our models also show competitive performance with rich-resource models that additionally leverage in-domain dictionaries provided by domain experts. In few-shot NER, we outperform the previous best model by 5.2 F1 score on three benchmarks and achieve new state-of-the-art performance.", "track": "Information Extraction", "label": 5}, {"loc": [8.941607475280762, 8.026017189025879], "id": 1872, "title": "TemporalWiki: A Lifelong Benchmark for Training and Evaluating Ever-Evolving Language Models", "authors": "Joel Jang, Seonghyeon Ye, Changho Lee, Sohee Yang, Joongbo Shin, Janghoon Han, Gyeonghun Kim and Minjoon Seo", "abstract": "Language Models (LMs) become outdated as the world changes; they often fail to perform tasks requiring recent factual information which was absent or different during training, a phenomenon called temporal misalignment. This is especially a challenging problem because the research community still lacks a coherent dataset for assessing the adaptability of LMs to frequently-updated knowledge corpus such as Wikipedia. To this end, we introduce TemporalWiki, a lifelong benchmark for ever-evolving LMs that utilizes the difference between consecutive snapshots of English Wikipedia and English Wikidata for training and evaluation, respectively. The benchmark hence allows researchers to periodically track an LM's ability to retain previous knowledge and acquire updated/new knowledge at each point in time. We also find that training an LM on the diff data through continual learning methods achieves similar or better perplexity than on the entire snapshot in our benchmark with 12 times less computational cost, which verifies that factual knowledge in LMs can be safely updated with minimal training data via continual learning.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [2.4104552268981934, 7.3146233558654785], "id": 1884, "title": "Bi-Directional Iterative Prompt-Tuning for Event Argument Extraction", "authors": "Lu Dai, Bang Wang, Wei Xiang and yijun mo", "abstract": "Recently, prompt-tuning has attracted growing interests in event argument extraction (EAE). However, the existing prompt-tuning methods have not achieved satisfactory performance due to the lack of consideration of entity information. In this paper, we propose a bi-directional iterative prompt-tuning method for EAE, where the EAE task is treated as a cloze-style task to take full advantage of entity information and pre-trained language models (PLMs). Furthermore, our method explores event argument interactions by introducing the argument roles of contextual entities into prompt construction. Since template and verbalizer are two crucial components in a cloze-style prompt, we propose to utilize the role label semantic knowledge to construct a semantic verbalizer and design three kind of templates for the EAE task. Experiments on the ACE 2005 English dataset with standard and low-resource settings show that the proposed method significantly outperforms the peer state-of-the-art methods.", "track": "Information Extraction", "label": 5}, {"loc": [0.742217481136322, 7.943090915679932], "id": 1894, "title": "Learning Robust Representations for Continual Relation Extraction via Adversarial Class Augmentation", "authors": "Peiyi Wang, Yifan Song, Tianyu Liu, Binghuai Lin, Yunbo Cao, Sujian Li and Zhifang Sui", "abstract": "Continual relation extraction (CRE) aims to continually learn new relations from a class-incremental data stream. CRE model usually suffers from catastrophic forgetting problem, i.e., the performance of old relations seriously degrades when the model learns new relations. Most previous work attributes catastrophic forgetting to the corruption of the learned representations as new relations come, with an implicit assumption that the CRE models have adequately learned the old relations. In this paper, through empirical studies we argue that this assumption may not hold, and an important reason for catastrophic forgetting is that the learned representations do not have good robustness against the appearance of analogous relations in the subsequent learning process. To address this issue, we encourage the model to learn more precise and robust representations through a simple yet effective adversarial class augmentation mechanism (ACA), which is easy to implement and model-agnostic.\nExperimental results show that ACA can consistently improve the performance of state-of-the-art CRE models on two popular benchmarks.", "track": "Information Extraction", "label": 5}, {"loc": [3.8870460987091064, 4.14500617980957], "id": 1895, "title": "ConvFinQA: Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering", "authors": "Zhiyu Chen, Shiyang Li, Charese Smiley, Zhiqiang Ma, Sameena Shah and William Yang Wang", "abstract": "With the recent advance in large pre-trained language models, researchers have achieved record performances in NLP tasks that mostly focus on language pattern matching. The community is experiencing the shift of the challenge from how to model language to the imitation of complex reasoning abilities like human beings. In this work, we investigate the application domain of finance that involves real-world, complex numerical reasoning. We propose a new large-scale dataset, ConvFinQA, aiming to study the chain of numerical reasoning in conversational question answering. Our dataset poses great challenge in modeling long-range, complex numerical reasoning paths in real-world conversations. We conduct comprehensive experiments and analyses with both the neural symbolic methods and the prompting-based methods, to provide insights into the reasoning mechanisms of these two divisions. We believe our new dataset should serve as a valuable resource to push forward the exploration of real-world, complex reasoning tasks as the next research focus. \nOur dataset and code is publicly available at https://github.com/czyssrs/ConvFinQA.", "track": "Question Answering", "label": 11}, {"loc": [5.628740310668945, 12.70927906036377], "id": 1903, "title": "A Span-based Multimodal Variational Autoencoder for Semi-supervised Multimodal Named Entity Recognition", "authors": "Baohang Zhou, Ying Zhang, Kehui Song, Wenya Guo, Guoqing Zhao, hongbin wang and Xiaojie Yuan", "abstract": "Multimodal named entity recognition (MNER) on social media is a challenging task which aims to extract named entities in free text and incorporate images to classify them into user-defined types. However, the annotation for named entities on social media demands a mount of human efforts. The existing semi-supervised named entity recognition methods focus on the text modal and are utilized to reduce labeling costs in traditional NER. However, the previous methods are not efficient for semi-supervised MNER. Because the MNER task is defined to combine the text information with image one and needs to consider the mismatch between the posted text and image. To fuse the text and image features for MNER effectively under semi-supervised setting, we propose a novel span-based multimodal variational autoencoder (SMVAE) model for semi-supervised MNER. The proposed method exploits modal-specific VAEs to model text and image latent features, and utilizes product-of-experts to acquire multimodal features. In our approach, the implicit relations between labels and multimodal features are modeled by multimodal VAE. Thus, the useful information of unlabeled data can be exploited in our method under semi-supervised setting. Experimental results on two benchmark datasets demonstrate that our approach not only outperforms baselines under supervised setting, but also improves MNER performance with less labeled data than existing semi-supervised methods.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [3.878812789916992, 9.94894027709961], "id": 1904, "title": "R-TeaFor: Regularized Teacher-Forcing for Abstractive Summarization", "authors": "Guan-Yu Lin and Pu-Jen Cheng", "abstract": "Teacher-forcing is widely used in training sequence generation models to improve sampling efficiency and to stabilize training. However, teacher-forcing is vulnerable to the exposure bias problem. Previous works have attempted to address exposure bias by modifying the training data to simulate model-generated results. Nevertheless, they do not consider the pairwise relationship between the original training data and the modified ones, which provides more information during training. Hence, we propose Regularized Teacher-Forcing (R-TeaFor) to utilize this relationship for better regularization. Empirically, our experiments show that R-TeaFor outperforms previous summarization state-of-the-art models, and the results can be generalized to different pre-trained models.", "track": "Summarization", "label": 14}, {"loc": [10.580028533935547, 7.091395378112793], "id": 1915, "title": "Modeling Consistency Preference via Lexical Chains for Document-level Neural Machine Translation", "authors": "Xinglin Lyu, Junhui Li, shimin tao, Hao Yang, Ying Qin and Min Zhang", "abstract": "In this paper we aim to relieve the issue of lexical translation inconsistency for document-level neural machine translation (NMT) by modeling consistency preference for lexical chains, which consist of repeated words in a source-side document and provide a representation of the lexical consistency structure of the document. Specifically, we first propose lexical-consistency attention to capture consistency context among words in the same lexical chains. Then for each lexical chain we define and learn a consistency-tailored latent variable, which will guide the translation of corresponding sentences to enhance lexical translation consistency. Experimental results on Chinese\u2192English and French\u2192English document-level translation tasks show that our approach not only significantly improves translation performance in BLEU, but also substantially alleviates the problem of the lexical translation inconsistency.", "track": "Machine Translation", "label": 10}, {"loc": [8.129496574401855, 2.99873948097229], "id": 1920, "title": "Just Fine-tune Twice: Selective Differential Privacy for Large Language Models", "authors": "Weiyan Shi, Ryan Patrick Shea, Si Chen, Chiyuan Zhang, Ruoxi Jia and Zhou Yu", "abstract": "Protecting large language models from privacy leakage is becoming increasingly crucial with their wide adoption in real-world products. Yet applying *differential privacy* (DP), a canonical notion with provable privacy guarantees for machine learning models, to those models remains challenging due to the trade-off between model utility and privacy loss. Utilizing the fact that sensitive information in language data tends to be sparse, Shi et al. (2021) formalized a DP notion extension called *Selective Differential Privacy* (SDP) to protect only the sensitive tokens defined by a policy function. However, their algorithm only works for RNN-based models. In this paper, we develop a novel framework, *Just Fine-tune Twice* (JFT), that achieves SDP for state-of-the-art large transformer-based models. Our method is easy to implement: it first fine-tunes the model with *redacted* in-domain data, and then fine-tunes it again with the *original* in-domain data using a private training mechanism. Furthermore, we study the scenario of imperfect implementation of policy functions that misses sensitive tokens and develop systematic methods to handle it. Experiments show that our method achieves strong utility compared to previous baselines. We also analyze the SDP privacy guarantee empirically with the canary insertion attack.", "track": "Ethics", "label": 21}, {"loc": [3.6276538372039795, 9.820080757141113], "id": 1922, "title": "Factorizing Content and Budget Decisions in Abstractive Summarization of Long Documents", "authors": "Marcio Fonseca, Yftah Ziser and Shay B. Cohen", "abstract": "We argue that disentangling content selection from the budget used to cover salient content improves the performance and applicability of abstractive summarizers. Our method, FactorSum, does this disentanglement by factorizing summarization into two steps through an energy function: (1) generation of abstractive summary views covering salient information in subsets of the input document (document views); (2) combination of these views into a final summary, following a budget and content guidance. This guidance may come from different sources, including from an advisor model such as BART or BigBird, or in oracle mode -- from the reference. This factorization achieves significantly higher ROUGE scores on multiple benchmarks for long document summarization, namely PubMed, arXiv, and GovReport. Most notably, our model is effective for domain adaptation. When trained only on PubMed samples, it achieves a 46.29 ROUGE-1 score on arXiv, outperforming PEGASUS trained in domain by a large margin. Our experimental results indicate that the performance gains are due to more flexible budget adaptation and processing of shorter contexts provided by partial document views.", "track": "Summarization", "label": 14}, {"loc": [6.068068981170654, 12.478471755981445], "id": 1926, "title": "Open-Domain Sign Language Translation Learned from Online Video", "authors": "Bowen Shi, Diane K. Brentari, Gregory Shakhnarovich and Karen Livescu", "abstract": "Existing work on sign language translation -- that is, translation from sign language videos into sentences in a written language -- has focused mainly on (1) data collected in a controlled environment or (2) data in a specific domain, which limits the applicability to real-world settings. In this paper, we introduce OpenASL, a large-scale American Sign Language (ASL) - English dataset collected from online video sites (e.g., YouTube).\nOpenASL contains 288 hours of ASL videos in multiple domains from over 200 signers and is the largest publicly available ASL translation dataset to date. To tackle the challenges of sign language translation in realistic settings and without glosses, we propose a set of techniques including sign search as a pretext task for pre-training and fusion of mouthing and handshape features. The proposed techniques produce consistent and large improvements in translation quality, over baseline models based\non prior work.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [8.924873352050781, 7.937226295471191], "id": 1930, "title": "Improving Temporal Generalization of Pre-trained Language Models with Lexical Semantic Change", "authors": "Zhaochen Su, Zecheng Tang, xinyan guan, Lijun Wu, Min Zhang and Juntao Li", "abstract": "Recent research has revealed that neural language models at scale suffer from poor temporal generalization capability, i.e., language model pre-trained on static data from past years performs worse over time on emerging data. Existing methods mainly perform continual training to mitigate such a misalignment. While effective to some extent but is far from being addressed on both the language modeling and downstream tasks. In this paper, we empirically observe that temporal generalization is closely affiliated with lexical semantic change, which is one of the essential phenomena of natural languages. Based on this observation, we propose a simple yet effective lexical-level masking strategy to post-train a converged language model. Experiments on two pre-trained language models, two different classification tasks, and four benchmark datasets demonstrate the effectiveness of our proposed method over existing temporal adaptation methods, i.e., continual training with new data. Our code is available at https://github.com/zhaochen0110/LMLM.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [5.691197395324707, 10.660399436950684], "id": 1934, "title": "ULN: Towards Underspecified Vision-and-Language Navigation", "authors": "Weixi Feng, Tsu-Jui Fu, Yujie Lu and William Yang Wang", "abstract": "Vision-and-Language Navigation (VLN) is a task to guide an embodied agent moving to a target position using language instructions. Despite the significant performance improvement, the wide use of fine-grained instructions fails to characterize more practical linguistic variations in reality. To fill in this gap, we introduce a new setting, namely Underspecified vision-and-Language Navigation (ULN), and associated evaluation datasets. ULN evaluates agents using multi-level underspecified instructions instead of purely fine-grained or coarse-grained, which is a more realistic and general setting. As a primary step toward ULN, we propose a VLN framework that consists of a classification module, a navigation agent, and an Exploitation-to-Exploration (E2E) module. Specifically, we propose to learn Granularity Specific Sub-networks (GSS) for the agent to ground multi-level instructions with minimal additional parameters. Then, our E2E module estimates grounding uncertainty and conducts multi-step lookahead exploration to improve the success rate further. Experimental results show that existing VLN models are still brittle to multi-level language underspecification. Our framework is more robust and outperforms the baselines on ULN by ~10% relative success rate across all levels.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [8.085122108459473, 3.07778263092041], "id": 1935, "title": "Federated Model Decomposition with Private Vocabulary for Text Classification", "authors": "Zhuo Zhang, Xiangjing Hu, Lizhen Qu, Qifan Wang and Zenglin Xu", "abstract": "With the necessity of privacy protection, it becomes increasingly vital to train deep neural models in a federated learning manner for natural language processing (NLP) tasks. However, recent studies show eavesdroppers (i.e., dishonest servers) can still reconstruct the private input in federated learning (FL). Such a data reconstruction attack relies on the mappings between vocabulary and associated word embedding in NLP tasks, which are unfortunately less studied in current FL methods. In this paper, we propose a fedrated model decomposition method that protects the privacy of vocabularies, shorted as FEDEVOCAB. In FEDEVOCAB, each participant keeps the local embedding layer in the local device and detaches the local embedding parameters from federated aggregation. However, it is challenging to train an accurate NLP model when the private mappings are unknown and vary across participants in a cross-device FL setting. To address this problem, we further propose an adaptive updating technique to improve the performance of local models. Experimental results show that FEDEVOCAB maintains competitive performance and provides better privacy-preserving capacity compared to status quo methods.", "track": "NLP Applications", "label": 0}, {"loc": [5.040955543518066, 5.4097113609313965], "id": 1958, "title": "ReCo: Reliable Causal Chain Reasoning via Structural Causal Recurrent Neural Networks", "authors": "Kai Xiong, Xiao Ding, Zhongyang Li, Li Du, Ting Liu, Bing Qin, Yi Zheng and baoxing Huai", "abstract": "Causal chain reasoning (CCR) is an essential ability for many decision-making AI systems, which requires the model to build reliable causal chains by connecting causal pairs. However, CCR suffers from two main transitive problems: threshold effect and scene drift. In other words, the causal pairs to be spliced may have a conflicting threshold boundary or scenario.\nTo address these issues, we propose a novel Reliable Causal chain reasoning framework (ReCo), which introduces exogenous variables to represent the threshold and scene factors of each causal pair within the causal chain, and estimates the threshold and scene contradictions across exogenous variables via structural causal recurrent neural networks (SRNN). Experiments show that ReCo outperforms a series of strong baselines on both Chinese and English CCR datasets. Moreover, by injecting reliable causal chain knowledge distilled by ReCo, BERT can achieve better performances on four downstream causal-related tasks than BERT models enhanced by other kinds of knowledge.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [5.542981147766113, 11.663244247436523], "id": 1961, "title": "Video Question Answering: Datasets, Algorithms and Challenges", "authors": "Yaoyao Zhong, Wei Ji, Junbin Xiao, Yicong Li, Weihong Deng and Tat-Seng Chua", "abstract": "This survey aims to sort out the recent advances in video question answering (VideoQA) and point towards future directions. We firstly categorize the datasets into 1) normal VideoQA, multi-modal VideoQA and knowledge-based VideoQA, according to the modalities invoked in the question-answer pairs, or 2) factoid VideoQA and inference VideoQA, according to the technical challenges in comprehending the questions and deriving the correct answers. We then summarize the VideoQA techniques, including those mainly designed for Factoid QA (e.g., the early spatio-temporal attention-based methods and the recently Transformer-based ones) and those targeted at explicit relation and logic inference (e.g., neural modular networks, neural symbolic methods, and graph-structured methods). Aside from the backbone techniques, we delve into the specific models and find out some common and useful insights either for video modeling, question answering, or for cross-modal correspondence learning. Finally, we point out the research trend of studying beyond factoid VideoQA to inference VideoQA, as well as towards the robustness and interpretability. Additionally, we maintain a repository, https://github.com/VRU-NExT/VideoQA, to keep trace of the latest VideoQA papers, datasets, and their open-source implementations if available. \nWith these efforts, we strongly hope this survey could shed light on the follow-up VideoQA research.", "track": "Question Answering", "label": 11}, {"loc": [7.972160339355469, 5.789251327514648], "id": 1962, "title": "Retrofitting Multilingual Sentence Embeddings with Abstract Meaning Representation", "authors": "Deng Cai, Xin Li, Jackie Chun-Sing Ho, Lidong Bing and Wai Lam", "abstract": "We introduce a new method to improve existing multilingual sentence embeddings with Abstract Meaning Representation (AMR). Compared with the original textual input, AMR is a structured semantic representation that presents the core concepts and relations in a sentence explicitly and unambiguously. It also helps reduce the surface variations across different expressions and languages. Unlike most prior work that only evaluates the ability to measure semantic similarity, we present a thorough evaluation of existing multilingual sentence embeddings and our improved versions, which include a collection of five transfer tasks in different downstream applications. Experiment results show that retrofitting multilingual sentence embeddings with AMR leads to better state-of-the-art performance on both semantic textual similarity and transfer tasks.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [10.6474027633667, 7.39369535446167], "id": 1969, "title": "Breaking the Representation Bottleneck of Chinese Characters: Neural Machine Translation with Stroke Sequence Modeling", "authors": "Zhijun Wang, Xuebo Liu and Min Zhang", "abstract": "Existing research generally treats Chinese character as a minimum unit for representation. However, such Chinese character representation will suffer two bottlenecks: 1) Learning bottleneck, the learning cannot benefit from its rich internal features (e.g., radicals and strokes); and 2) Parameter bottleneck, each individual character has to be represented by a unique vector. In this paper, we introduce a novel representation method for Chinese characters to break the bottlenecks, namely StrokeNet, which represents a Chinese character by a Latinized stroke sequence (e.g., \"\u51f9 (concave)\" to \"ajaie\" and \"\u51f8 (convex)\" to \"aeaqe\"). Specifically, StrokeNet maps each stroke to a specific Latin character, thus allowing similar Chinese characters to have similar Latin representations. With the introduction of StrokeNet to neural machine translation (NMT), many powerful but not applicable techniques to non-Latin languages (e.g., shared subword vocabulary learning and ciphertext-based data augmentation) can now be perfectly implemented. Experiments on the widely-used NIST Chinese-English, WMT17 Chinese-English and IWSLT17 Japanese-English NMT tasks show that StrokeNet can provide a significant performance boost over the strong baselines with fewer model parameters, achieving 26.5 BLEU on the WMT17 Chinese-English task which is better than any previously reported results without using monolingual data. Code and scripts are freely available at https://github.com/zjwang21/StrokeNet.", "track": "Machine Translation", "label": 10}, {"loc": [0.9955535531044006, 10.487765312194824], "id": 1978, "title": "Boundary-Driven Table-Filling for Aspect Sentiment Triplet Extraction", "authors": "Yice Zhang, Yifan Yang, Yihui Li, Bin Liang, Shiwei Chen, Yixue Dang, Min Yang and Ruifeng Xu", "abstract": "Aspect Sentiment Triplet Extraction (ASTE) aims to extract the aspect terms along with the corresponding opinion terms and the expressed sentiments in the review, which is an important task in sentiment analysis. Previous research efforts generally address the ASTE task in an end-to-end fashion through the table-filling formalization, in which the triplets are represented by a two-dimensional (2D) table of word-pair relations. Under this formalization, a term-level relation is decomposed into multiple independent word-level relations, which leads to relation inconsistency and boundary insensitivity in the face of multi-word aspect terms and opinion terms. To overcome these issues, we propose Boundary-Driven Table-Filling (BDTF), which represents each triplet as a relation region in the 2D table and transforms the ASTE task into detection and classification of relation regions. We also notice that the quality of the table representation greatly affects the performance of BDTF. Therefore, we develop an effective relation representation learning approach to learn the table representation, which can fully exploit both word-to-word interactions and relation-to-relation interactions. Experiments on several public benchmarks show that the proposed approach achieves state-of-the-art performances.", "track": "NLP Applications", "label": 0}, {"loc": [1.708221435546875, 8.741342544555664], "id": 1990, "title": "Attention and Edge-Label Guided Graph Convolutional Networks for Named Entity Recognition", "authors": "Renjie Zhou, Zhongyi Xie, Jian Wan, Jilin Zhang, Yong Liao and Qiang Liu", "abstract": "It has been shown that named entity recognition (NER) could benefit from incorporating the long-distance structured information captured by dependency trees. However, dependency trees built by tools usually have a certain percentage of errors. Under such circumstances, how to better use relevant structured information while ignoring irrelevant or wrong structured information from the dependency trees to improve NER performance is still a challenging research problem. In this paper, we propose the Attention and Edge-Label guided Graph Convolution Network (AELGCN) model. Then, we integrate it into BiLSTM-CRF to form BiLSTM-AELGCN-CRF model. We design an edge-aware node joint update module and introduce a node-aware edge update module to explore hidden in structured information entirely and solve the wrong dependency label information to some extent. After two modules, we apply attention-guided GCN, which automatically learns how to attend to the relevant structured information selectively. We conduct extensive experiments on several standard datasets across four languages and achieve better results than previous approaches. Through experimental analysis, it is found that our proposed model can better exploit the structured information on the dependency tree to improve the recognition of long entities.", "track": "Information Extraction", "label": 5}, {"loc": [2.1043202877044678, 7.574521064758301], "id": 1997, "title": "Title2Event: Benchmarking Open Event Extraction with a Large-scale Chinese Title Dataset", "authors": "Haolin Deng, Yanan Zhang, Yangfan Zhang, Wangyang Ying, Changlong Yu, Jun Gao, Wei Wang, Xiaoling Bai, Nan Yang, Jin Ma, xiang chen and tianhua zhou", "abstract": "Event extraction (EE) is crucial to downstream tasks such as new aggregation and event knowledge graph construction. Most existing EE datasets manually define fixed event types and design specific schema for each of them, failing to cover diverse events emerging from the online text. Moreover, news titles, an important source of event mentions, have not gained enough attention in current EE research. In this paper, we present Title2Event, a large-scale sentence-level dataset benchmarking Open Event Extraction without restricting event types. Title2Event contains more than 42,000 news titles in 34 topics collected from Chinese web pages. To the best of our knowledge, it is currently the largest manually annotated Chinese dataset for open event extraction. We further conduct experiments on Title2Event with different models and show that the characteristics of titles make it challenging for event extraction, addressing the significance of advanced study on this problem. The dataset and baseline codes are available at https://open-event-hub.github.io/title2event.", "track": "Resources and Evaluation", "label": 1}, {"loc": [5.632648468017578, 5.509811878204346], "id": 2018, "title": "Cascading Biases: Investigating the Effect of Heuristic Annotation Strategies on Data and Models", "authors": "Chaitanya Malaviya, Sudeep Bhatia and Mark Yatskar", "abstract": "Cognitive psychologists have documented that humans use cognitive heuristics, or mental shortcuts, to make quick decisions while expending less effort. While performing annotation work on crowdsourcing platforms, we hypothesize that such heuristic use among annotators cascades on to data quality and model robustness. In this work, we study cognitive heuristic use in the context of annotating multiple-choice reading comprehension datasets. We propose tracking annotator heuristic traces, where we tangibly measure low-effort annotation strategies that could indicate usage of various cognitive heuristics. We find evidence that annotators might be using multiple such heuristics, based on correlations with a battery of psychological tests. Importantly, heuristic use among annotators determines data quality along several dimensions: (1) known biased models, such as partial input models, more easily solve examples authored\nby annotators that rate highly on heuristic use, (2) models trained on annotators scoring highly on heuristic use don't generalize as well, and (3) heuristic-seeking annotators tend to create qualitatively less challenging examples. Our findings suggest that tracking heuristic usage among annotators can potentially help with collecting challenging datasets and diagnosing model biases.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [3.8742270469665527, 4.28428840637207], "id": 2024, "title": "Teaching Broad Reasoning Skills for Multi-Step QA by Generating Hard Contexts", "authors": "Harsh Trivedi, Niranjan Balasubramanian, Tushar Khot and Ashish Sabharwal", "abstract": "Question-answering datasets require a broad set of reasoning skills. We show how to use question decompositions to teach language models these broad reasoning skills in a robust fashion. Specifically, we use widely available QDMR representations to programmatically create hard-to-cheat synthetic contexts for real questions in six multi-step reasoning datasets. These contexts are carefully designed to avoid common reasoning shortcuts prevalent in real contexts that prevent models from learning the right skills. This results in a pretraining dataset, named TeaBReaC, containing 525K multi-step questions (with associated formal programs) covering about 900 reasoning patterns. We show that pretraining standard language models (LMs) on TeaBReaC before fine-tuning them on target datasets improves their performance by up to 13 F1 points across 4 multi-step QA datasets, with up to 21 point gain on more complex questions. The resulting models also demonstrate higher robustness, with a 5-8 F1 point improvement on two contrast sets. Furthermore, TeaBReaC pretraining substantially improves model performance and robustness even when starting with numerate LMs pretrained using recent methods (e.g., PReasM, POET). Our work thus shows how to effectively use decomposition-guided contexts to robustly teach multi-step reasoning.", "track": "Question Answering", "label": 11}, {"loc": [7.597898006439209, 3.6674375534057617], "id": 2038, "title": "ADDMU: Detection of Far-Boundary Adversarial Examples with Data and Model Uncertainty Estimation", "authors": "Fan Yin, Yao Li, Cho-Jui Hsieh and Kai-Wei Chang", "abstract": "Adversarial Examples Detection (AED) is a crucial defense technique against adversarial attacks and has drawn increasing attention from the Natural Language Processing (NLP) community. Despite the surge of new AED methods, our studies show that existing methods heavily rely on a shortcut to achieve good performance. In other words, current search-based adversarial attacks in NLP stop once model predictions change, and thus most adversarial examples generated by those attacks are located near model decision boundaries. To surpass this shortcut and fairly evaluate AED methods, we propose to test AED methods with \\textbf{F}ar \\textbf{B}oundary (\\textbf{FB}) adversarial examples. Existing methods show worse than random guess performance under this scenario. To overcome this limitation, we propose a new technique, \\textbf{ADDMU}, \\textbf{a}dversary \\textbf{d}etection with \\textbf{d}ata and \\textbf{m}odel \\textbf{u}ncertainty, which combines two types of uncertainty estimation for both regular and FB adversarial example detection. Our new method outperforms previous methods by 3.6 and 6.0 \\emph{AUC} points under each scenario. Finally, our analysis shows that the two types of uncertainty provided by \\textbf{ADDMU} can be leveraged to characterize adversarial\nexamples and identify the ones that contribute most to model's robustness in adversarial training.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [8.273724555969238, 8.698738098144531], "id": 2040, "title": "G-MAP: General Memory-Augmented Pre-trained Language Model for Domain Tasks", "authors": "Zhongwei Wan, Yichun Yin, Wei Zhang, Jiaxin Shi, Lifeng Shang, Guangyong Chen, Xin Jiang and Qun Liu", "abstract": "General pre-trained language models (PLMs), such as BERT, have achieved remarkable performance on various NLP tasks. Recently, domain-specific PLMs have been proposed to boost the task performance of specific domains (e.g., biomedical and computer science) by continuing to pre-train general PLMs with domain-specific corpora. However, this domain-adaptive pre-training (DAPT \\cite{DBLP:conf/acl/GururanganMSLBD20}) tends to forget the previous general knowledge acquired by general PLMs, which leads to a \\emph{catastrophic forgetting} phenomenon and sub-optimal performance. To alleviate this problem, we propose a new framework of \\textbf{M}emory-\\textbf{A}ugmented \\textbf{P}re-trained Language Model (\\textbf{MAP}), which augments the domain-specific PLM by a memory built from the frozen general PLM without losing the general knowledge. Specifically, we propose a new memory-augmented layer, and based on it, different augmentation strategies are explored to build memory and fusion memory into domain-specific PLM. We demonstrate the effectiveness of MAP on different domains (biomedical and computer science publications, news, and reviews) and different kinds (text classification, QA, NER) of tasks, and the extensive results show that the proposed MAP can achieve SOTA results on these tasks.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [5.497122764587402, 12.16763973236084], "id": 2051, "title": "Towards Unifying Reference Expression Generation and Comprehension", "authors": "Duo Zheng, Tao Kong, Ya Jing, Jiaan Wang and Xiaojie WANG", "abstract": "Reference Expression Generation (REG) and Comprehension (REC) are two highly correlated tasks. Modeling REG and REC simultaneously for utilizing the relation between them is a promising way to improve both. However, the problem of distinct inputs, as well as building connections between them in a single model, brings challenges to the design and training of the joint model. To address the problems, we propose a unified model for REG and REC, named UniRef. It unifies these two tasks with the carefully-designed Image-Region-Text Fusion layer (IRTF), which fuses the image, region and text via the image cross-attention and region cross-attention. Additionally, IRTF could generate pseudo input regions for the REC task to enable a uniform way for sharing the identical representation space across the REC and REG. We further propose Vision-conditioned Masked Language Modeling (VMLM) and Text-Conditioned Region Prediction (TRP) to pre-train UniRef model on multi-granular corpora. The VMLM and TRP are directly related to REG and REC, respectively, but could help each other. We conduct extensive experiments on three benchmark datasets, RefCOCO, RefCOCO+ and RefCOCOg. Experimental results show that our model outperforms previous state-of-the-art methods on both REG and REC.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [7.579550266265869, 3.6938905715942383], "id": 2053, "title": "Textual Manifold-based Defense Against Natural Language Adversarial Examples", "authors": "Dang Nguyen Minh and Anh Tuan Luu", "abstract": "Despite the recent success of large pretrained language models in NLP, they are susceptible to adversarial examples. Concurrently, several studies on adversarial images have observed an intriguing property: the adversarial images tend to leave the low-dimensional natural data manifold. In this study, we find a similar phenomenon occurs in the contextualized embedding space of natural sentences induced by pretrained language models in which textual adversarial examples tend to have their embeddings diverge off the manifold of natural sentence embeddings. Based on this finding, we propose Textual Manifold-based Defense (TMD), a defense mechanism that learns the embedding space manifold of the underlying language model and projects novel inputs back to the approximated structure before classification. Through extensive experiments, we find that our method consistently and significantly outperforms previous defenses under various attack settings while remaining unaffected to the clean accuracy. To the best of our knowledge, this is the first kind of manifold-based defense adapted to the NLP domain.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [8.69739818572998, 8.289453506469727], "id": 2071, "title": "Tiny-Attention Adapter: Contexts Are More Important Than the Number of Parameters", "authors": "Hongyu Zhao, Hao Tan and Hongyuan Mei", "abstract": "Adapter-tuning is a paradigm that transfers a pretrained language model to downstream tasks by adding and tuning a small number of new parameters. Previously proposed adapter architectures are all feed-forward neural networks. In this paper, we investigate the effectiveness of using tiny-attention---i.e., attention with extremely small per-head dimensionality---as adapters. Our tiny-attention adapter learns to modify the hidden states at each position directly conditioned on the hidden states at all the other positions, which is missed by the previously proposed adapters. Moreover, we view its multiple attention heads as a mixture of experts and propose to average their weights during deployment, which further reduces its inference computation cost. On the GLUE benchmark, our tiny-attention adapter outperforms the other parameter-efficient transfer learning methods as well as full fine-tuning while only updating 0.05% of the parameters. On the FewGLUE benchmark, its performance is comparable to that of GPT-3 and PET.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [9.85632038116455, 7.325463771820068], "id": 2072, "title": "Reduce Catastrophic Forgetting of Dense Retrieval Training with Teleportation Negatives", "authors": "Si Sun, Chenyan Xiong, Yue Yu, Arnold Overwijk, Zhiyuan Liu and Jie Bao", "abstract": "In this paper, we investigate the instability in the standard dense retrieval training, which iterates between model training and hard negative selection using the being-trained model. We show the catastrophic forgetting phenomena behind the training instability, where models learn and forget different negative groups during training iterations. We then propose ANCE-Tele, which accumulates momentum negatives from past iterations and approximates future iterations using lookahead negatives, as \"teleportations\" along the time axis to smooth the learning process. On web search and OpenQA, ANCE-Tele outperforms previous state-of-the-art systems of similar size, eliminates the dependency on sparse retrieval negatives, and is competitive among systems using significantly more (50x) parameters. Our analysis demonstrates that teleportation negatives reduce catastrophic forgetting and improve convergence speed for dense retrieval training. The source code of this paper is available at https://github.com/OpenMatch/ANCE-Tele.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [7.951879501342773, 8.736644744873047], "id": 2076, "title": "ATTEMPT: Parameter-Efficient Multi-task Tuning via Attentional Mixtures of Soft Prompts", "authors": "Akari Asai, Mohammadreza Salehi, Matthew Peters and Hannaneh Hajishirzi", "abstract": "This work introduces a new multi-task, parameter-efficient language model (LM) tuning method that learns to transfer knowledge across different tasks via a mixture of soft prompts---small prefix embedding vectors pre-trained for different tasks. Our method, called ATTEMPT (ATTEntional Mixtures of Prompt Tuning), obtains source prompts as encodings of large-scale source tasks into a small number of parameters and trains an attention module to interpolate the source prompts and a newly initialized target prompt for every instance in the target task. During training, only the target task prompt and the attention weights, which are shared between tasks in multi-task training, are updated, while the original LM and source prompts are intact. ATTEMPT is highly parameter-efficient (e.g., updates 2,300 times fewer parameters than full fine-tuning), while it overcomes instability of prompt tuning and achieves high task performance using learned knowledge from high-resource tasks. Moreover, it is modular using pre-trained soft prompts, and can flexibly add or remove source prompts for effective knowledge transfer. Our experimental results across 21 diverse NLP datasets show that ATTEMPT significantly outperforms prompt tuning and outperforms or matches fully fine-tuned or other parameter-efficient tuning approaches that use 10 times more parameters. Finally, ATTEMPT outperforms previous work in few-shot learning settings.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [5.172458171844482, 12.411921501159668], "id": 2077, "title": "Exploration of the Usage of Color Terms by Color-blind Participants in Online Discussion Platforms", "authors": "Ella Rabinovich and Boaz Carmeli", "abstract": "Prominent questions about the role of sensory vs. linguistic input in the way we acquire and use language have been extensively studied in the psycholinguistic literature. However, the relative effect of various factors in a person's overall experience on their linguistic system remains unclear. We study this question by making a step forward towards a better understanding of the conceptual perception of colors by color-blind individuals, as reflected in their spontaneous linguistic productions. Using a novel and carefully curated dataset, we show that red-green color-blind speakers use the \"red\" and \"green\" color terms in less predictable contexts, and in linguistic environments evoking mental image to a lower extent, when compared to their normal-sighted counterparts. These findings shed some new and interesting light on the role of sensory experience on our linguistic system.", "track": "Ethic Concerns:Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [0.6542259454727173, 7.624789714813232], "id": 2079, "title": "DEER: Descriptive Knowledge Graph for Explaining Entity Relationships", "authors": "Jie Huang, Kerui Zhu, Kevin Chang, Jinjun Xiong and Wen-mei Hwu", "abstract": "We propose DEER (Descriptive Knowledge Graph for Explaining Entity Relationships) - an open and informative form of modeling entity relationships. In DEER, relationships between entities are represented by free-text relation descriptions. For instance, the relationship between entities of machine learning and algorithm can be represented as ``Machine learning explores the study and construction of algorithms that can learn from and make predictions on data.'' To construct DEER, we propose a self-supervised learning method to extract relation descriptions with the analysis of dependency patterns and generate relation descriptions with a transformer-based relation description synthesizing model, where no human labeling is required. Experiments demonstrate that our system can extract and generate high-quality relation descriptions for explaining entity relationships. The results suggest that we can build an open and informative knowledge graph without human annotation.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [4.232655048370361, 7.570635795593262], "id": 2081, "title": "META-GUI: Towards Multi-modal Conversational Agents on Mobile GUI", "authors": "Liangtai Sun, Xingyu Chen, Lu Chen, Tianle Dai, Zichen Zhu and Kai Yu", "abstract": "Task-oriented dialogue (TOD) systems have been widely used by mobile phone intelligent assistants to accomplish tasks such as calendar scheduling or hotel reservation. Current TOD systems usually focus on multi-turn text/speech interaction, then they would call back-end APIs designed for TODs to perform the task. However, this API-based architecture greatly limits the information-searching capability of intelligent assistants and may even lead to task failure if TOD-specific APIs are not available or the task is too complicated to be executed by the provided APIs. In this paper, we propose a new TOD architecture: GUI-based task-oriented dialogue system (GUI-TOD). A GUI-TOD system can directly perform GUI operations on real APPs and execute tasks without invoking TOD-specific backend APIs. Furthermore, we release META-GUI, a dataset for training a Multi-modal convErsaTional Agent on mobile GUI. We also propose a multi-model action prediction and response model, which show promising results on META-GUI. The dataset, codes and leaderboard are publicly available.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [8.467928886413574, 7.9325056076049805], "id": 2085, "title": "Understanding and Improving Knowledge Distillation for Quantization Aware Training of Large Transformer Encoders", "authors": "Minsoo Kim, Sihwa Lee, Suk-Jin Hong, Du-Seong Chang and Jungwook Choi", "abstract": "Knowledge distillation (KD) has been a ubiquitous method for model compression to strengthen the capability of a lightweight model with the transferred knowledge from the teacher. In particular, KD has been employed in quantization-aware training (QAT) of Transformer encoders like BERT to improve the accuracy of the student model with the reduced-precision weight parameters. However, little is understood about which of the various KD approaches best fits the QAT of Transformers. In this work, we provide an in-depth analysis of the mechanism of KD on attention recovery of quantized large Transformers. In particular, we reveal that the previously adopted MSE loss on the attention score is insufficient for recovering the self-attention information. Therefore, we propose two KD methods; attention-map and attention-output losses. Furthermore, we explore the unification of both losses to address task-dependent preference between attention-map and output losses. The experimental results on various Transformer encoder models demonstrate that the proposed KD methods achieve state-of-the-art accuracy for QAT with sub-2-bit weight quantization.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [8.805910110473633, 8.075787544250488], "id": 2101, "title": "Exploring Mode Connectivity for Pre-trained Language Models", "authors": "Yujia Qin, Cheng Qian, Jing Yi, Weize Chen, Yankai Lin, Xu Han, Zhiyuan Liu, Maosong Sun and Jie Zhou", "abstract": "Recent years have witnessed the prevalent application of pre-trained language models (PLMs) in NLP. From the perspective of parameter space, PLMs provide generic initialization, starting from which high-performance minima could be found. Although plenty of works have studied how to effectively and efficiently adapt PLMs to high-performance minima, little is known about the connection of various minima reached under different adaptation configurations. In this paper, we investigate the geometric connections of different minima through the lens of mode connectivity, which measures whether two minima can be connected with a low-loss path. We conduct empirical analyses to investigate three questions: (1) how could hyperparameters, specific tuning methods, and training data affect PLM's mode connectivity? (2) How does mode connectivity change during pre-training? (3) How does the PLM's task knowledge change along the path connecting two minima? In general, exploring the mode connectivity of PLMs conduces to understanding the geometric connection of different minima, which may help us fathom the inner workings of PLM downstream adaptation. The codes are publicly available at https://github.com/thunlp/Mode-Connectivity-PLM.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [10.258220672607422, 6.747270107269287], "id": 2104, "title": "Synergy with Translation Artifacts for Training and Inference in Multilingual Tasks", "authors": "Jaehoon Oh, Jongwoo Ko and Se-Young Yun", "abstract": "Translation has played a crucial role in improving the performance on multilingual tasks: (1) to generate the target language data from the source language data for training and (2) to generate the source language data from the target language data for inference. However, prior works have not considered the use of both translations simultaneously. This paper shows that combining them can synergize the results on various multilingual sentence classification tasks. We empirically find that translation artifacts stylized by translators are the main factor of the performance gain. Based on this analysis, we adopt two training methods, SupCon and MixUp, considering translation artifacts. Furthermore, we propose a cross-lingual fine-tuning algorithm called MUSC, which uses SupCon and MixUp jointly and improves the performance. Our code is available at https://github.com/jongwooko/MUSC.", "track": "Multilinguality", "label": 13}, {"loc": [6.135923862457275, 12.493571281433105], "id": 2105, "title": "Increasing Visual Awareness in Multimodal Neural Machine Translation from an Information Theoretic Perspective", "authors": "Baijun Ji, Tong Zhang, Yicheng Zou, Bojie Hu and si shen", "abstract": "Multimodal machine translation (MMT) aims to improve translation quality by equipping the source sentence with its corresponding image. Despite the promising performance, MMT models still suffer the problem of input degradation: models focus more on textual information while visual information is generally overlooked. In this paper, we endeavor to improve MMT performance by increasing visual awareness from an information theoretic perspective. In detail, we decompose the informative visual signals into two parts: source-specific information and target-specific information. We use mutual information to quantify them and propose two methods for objective optimization to better leverage visual signals. Experiments on two datasets demonstrate that our approach can effectively enhance the visual awareness of MMT model and achieve superior results against strong baselines.", "track": "Machine Translation", "label": 10}, {"loc": [2.221330404281616, 7.7354841232299805], "id": 2114, "title": "Improving Event Coreference Resolution Using Document-level and Topic-level Information", "authors": "Sheng Xu, Peifeng Li and Qiaoming Zhu", "abstract": "Event coreference resolution (ECR) aims to cluster event mentions that refer to the same real-world events. Deep learning methods have achieved SOTA results on the ECR task. However, due to the encoding length limitation, previous methods either adopt classical pairwise models based on sentence-level context or split each document into multiple chunks and encode them separately. They failed to capture the interactions and contextual cues among those long-distance event mentions. Besides, high-level information, such as event topics, is rarely considered to enhance representation learning for ECR. To address the above two issues, we first apply a Longformer-based encoder to obtain the document-level embeddings and an encoder with a trigger-mask mechanism to learn sentence-level embeddings based on local context. In addition, we propose an event topic generator to infer the latent topic-level representations. Finally, using the above event embeddings, we employ a multiple tensor matching method to capture their interactions at the document, sentence, and topic levels. Experimental results on the KBP 2017 dataset show that our model outperforms the SOTA baselines.", "track": "Information Extraction", "label": 5}, {"loc": [7.667898654937744, 8.778020858764648], "id": 2124, "title": "Vector-Quantized Input-Contextualized Soft Prompts for Natural Language Understanding", "authors": "Rishabh Bhardwaj, Amrita Saha, Steven C.H. Hoi and Soujanya Poria", "abstract": "Prompt Tuning has been largely successful as a parameter-efficient method of conditioning large-scale pre-trained language models to perform downstream tasks. Thus far, soft prompt tuning learns a fixed set of task-specific continuous vectors, i.e., soft tokens that remain static across the task samples. A fixed prompt, however, may not generalize well to the diverse kinds of inputs the task comprises. In order to address this, we propose Vector-quantized Input-contextualized Prompts (VIP) as an extension to the soft prompt tuning framework. VIP particularly focuses on two aspects---contextual prompts that learns input-specific contextualization of the soft prompt tokens through a small-scale sentence encoder and quantized prompts that maps the contextualized prompts to a set of learnable codebook vectors through a Vector quantization network. On various language understanding tasks like SuperGLUE, QA, Relation classification, NER and NLI, VIP outperforms the soft prompt tuning (PT) baseline by an average margin of 1.19%. Further, our generalization studies show that VIP learns more robust prompt representations, surpassing PT by a margin of 0.6% - 5.3% on Out-of-domain QA and NLI tasks respectively, and by 0.75% on Multi-Task setup over 4 tasks spanning across 12 domains.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [7.8026227951049805, 8.669318199157715], "id": 2128, "title": "Boosting Natural Language Generation from Instructions with Meta-Learning", "authors": "Budhaditya Deb, Ahmed Hassan Awadallah and Guoqing Zheng", "abstract": "Recent work has shown that language models (LMs) trained with multi-task \\textit{instructional learning} (MTIL) can solve diverse NLP tasks in zero- and few-shot settings with improved performance compared to prompt tuning. MTIL illustrates that LMs can extract and use information about the task from instructions beyond the surface patterns of the inputs and outputs. This suggests that meta-learning may further enhance the utilization of instructions for effective task transfer. In this paper we investigate whether meta-learning applied to MTIL can further improve generalization to unseen tasks in a zero-shot setting. Specifically, we propose to adapt meta-learning to MTIL in three directions: 1) Model Agnostic Meta Learning (MAML), 2) Hyper-Network (HNet) based adaptation to generate task specific parameters conditioned on instructions, and 3) an approach combining HNet and MAML. Through extensive experiments on the large scale Natural Instructions V2 dataset, we show that our proposed approaches significantly improve over strong baselines in zero-shot settings. In particular, meta-learning improves the effectiveness of instructions and is most impactful when the test tasks are strictly zero-shot (i.e. no similar tasks in the training set) and are \"hard\" for LMs, illustrating the potential of meta-learning for MTIL for out-of-distribution tasks.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [4.019964218139648, 9.293288230895996], "id": 2137, "title": "Topical Segmentation of Spoken Narratives: A Test Case on Holocaust Survivor Testimonies", "authors": "Eitan Wagner, Renana Keydar, Amit Pinchevski and Omri Abend", "abstract": "The task of topical segmentation is well studied, but previous work has mostly addressed it in the context of structured, well-defined segments, such as segmentation into paragraphs, chapters, or segmenting text that originated from multiple sources. We tackle the task of segmenting running (spoken) narratives, which poses hitherto unaddressed challenges. As a test case, we address Holocaust survivor testimonies, given in English. Other than the importance of studying these testimonies for Holocaust research, we argue that they provide an interesting test case for topical segmentation, due to their unstructured surface level, relative abundance (tens of thousands of such testimonies were collected), and the relatively confined domain that they cover. We hypothesize that boundary points between segments correspond to low mutual information between the sentences proceeding and following the boundary. Based on this hypothesis, we explore a range of algorithmic approaches to the task, building on previous work on segmentation that uses generative Bayesian modeling and state-of-the-art neural machinery. Compared to manually annotated references, we find that the developed approaches show considerable improvements over previous work.", "track": "NLP Applications", "label": 0}, {"loc": [10.287161827087402, 6.855659008026123], "id": 2141, "title": "Unifying the Convergences in Multilingual Neural Machine Translation", "authors": "yichong huang, Xiaocheng Feng, Xinwei Geng and Bing Qin", "abstract": "Although all-in-one-model multilingual neural machine translation (MNMT) has achieved remarkable progress, the convergence inconsistency in the joint training is ignored, i.e., different language pairs reaching convergence in different epochs. This leads to the trained MNMT model over-fitting low-resource language translations while under-fitting high-resource ones. In this paper, we propose a novel training strategy named LSSD (LanguageSpecific Self-Distillation), which can alleviate the convergence inconsistency and help MNMT models achieve the best performance on each language pair simultaneously. Specifically, LSSD picks up language-specific best checkpoints for each language pair to teach the current model on the fly. Furthermore, we systematically explore three sample-level manipulations of knowledge transferring. Experimental results on three datasets show that LSSD obtains consistent improvements towards all language pairs and achieves the state-of-the-art.", "track": "Machine Translation", "label": 10}, {"loc": [1.583666443824768, 8.56206226348877], "id": 2153, "title": "Modeling Label Correlations for Ultra-Fine Entity Typing with Neural Pairwise Conditional Random Field", "authors": "Chengyue Jiang, Yong Jiang, Weiqi Wu, Pengjun Xie and Kewei Tu", "abstract": "Ultra-fine entity typing (UFET) aims to predict a wide range of type phrases that correctly describe the categories of a given entity mention in a sentence. \n Most recent works infer each entity type independently, ignoring the correlations between types, e.g., when an entity is inferred as a {\\it president}, it should also be a {\\it politician} and a {\\it leader}.\n To this end, we use an undirected graphical model called pairwise conditional random field (PCRF) to formulate the UFET problem, in which the type variables are not only unarily influenced by the input but also pairwisely relate to all the other type variables.\n We use various modern backbones for entity typing to compute unary potentials, and derive pairwise potentials from type phrase representations that both capture prior semantic information and facilitate accelerated inference. We use mean-field variational inference for efficient type inference on very large type sets and unfold it as a neural network module to enable end-to-end training. \n Experiments on UFET show that the Neural-PCRF consistently outperforms its backbones with little cost and results in a competitive performance against cross-encoder based SOTA while being \\emph{thousands of times} faster. We also find Neural-PCRF effective on a widely used fine-grained entity typing dataset with a smaller type set. We pack Neural-PCRF as a network module that can be plugged onto multi-label type classifiers with ease and release it in \\code.", "track": "Information Extraction", "label": 5}, {"loc": [5.128573894500732, 8.847064018249512], "id": 2155, "title": "Help me write a Poem - Instruction Tuning as a Vehicle for Collaborative Poetry Writing", "authors": "Tuhin Chakrabarty, Vishakh Padmakumar and He He", "abstract": "Recent work in training large language models (LLMs) to follow natural language instructions has opened up exciting opportunities for natural language interface design. Building on the prior success of large language models in the realm of computer assisted creativity, in this work, we present \\textit{CoPoet}, a collaborative poetry writing system, with the goal of to study if LLM's actually improve the quality of the generated content. In contrast to auto-completing a user's text, CoPoet is controlled by user instructions that specify the attributes of the desired text, such as \\textit{Write a sentence about 'love'} or \\textit{Write a sentence ending in 'fly'}. The core component of our system is a language model fine-tuned on a diverse collection of instructions for poetry writing. Our model is not only competitive to publicly available LLMs trained on instructions (InstructGPT), but also capable of satisfying unseen compositional instructions. A study with 15 qualified crowdworkers shows that users successfully write poems with CoPoet on diverse topics ranging from \\textit{Monarchy} to \\textit{Climate change}, which are preferred by third-party evaluators over poems written without the system.", "track": "Natural Language Generation", "label": 6}, {"loc": [1.1655627489089966, 7.677905559539795], "id": 2157, "title": "Open Relation and Event Type Discovery with Type Abstraction", "authors": "Sha Li, Heng Ji and Jiawei Han", "abstract": "Conventional \"closed-world\" information extraction (IE) approaches rely on human ontologies to define the scope for extraction. As a result, such approaches fall short when applied to new domains. This calls for systems that can automatically infer new types from given corpora, a task which we refer to as type discovery.\nTo tackle this problem, we introduce the idea of type abstraction, where the model is prompted to generalize and name the type. Then we use the similarity between inferred names to induce clusters. Observing that this abstraction-based representation is often complementary to the entity/trigger token representation, we set up these two representations as two views and design our model as a co-training framework. \nOur experiments on multiple relation extraction and event extraction datasets consistently show the advantage of our type abstraction approach.", "track": "Information Extraction", "label": 5}, {"loc": [0.7691812515258789, 6.9129719734191895], "id": 2160, "title": "Enhancing Multilingual Language Model with Massive Multilingual Knowledge Triples", "authors": "Linlin Liu, Xin Li, Ruidan He, Lidong Bing, Shafiq Joty and Luo Si", "abstract": "Knowledge-enhanced language representation learning has shown promising results across various knowledge-intensive NLP tasks. However, prior methods are limited in efficient utilization of multilingual knowledge graph (KG) data for language model (LM) pretraining. They often train LMs with KGs in indirect ways, relying on extra entity/relation embeddings to facilitate knowledge injection. In this work, we explore methods to make better use of the multilingual annotation and language agnostic property of KG triples, and present novel knowledge based multilingual language models (KMLMs) trained directly on the knowledge triples. We first generate a large amount of multilingual synthetic sentences using the Wikidata KG triples. Then based on the intra- and inter-sentence structures of the generated data, we design pretraining tasks to enable the LMs to not only memorize the factual knowledge but also learn useful logical patterns. Our pretrained KMLMs demonstrate significant performance improvements on a wide range of knowledge-intensive cross-lingual tasks, including named entity recognition (NER), factual knowledge retrieval, relation classification, and a newly designed logical reasoning task.", "track": "Multilinguality", "label": 13}, {"loc": [10.777545928955078, 9.305002212524414], "id": 2164, "title": "Revisiting Grammatical Error Correction Evaluation and Beyond", "authors": "Peiyuan Gong, Xuebo Liu, Heyan Huang and Min Zhang", "abstract": "Pretraining-based (PT-based) automatic evaluation metrics (e.g., BERTScore and BARTScore) have been widely used in several sentence generation tasks (e.g., machine translation and text summarization) due to their better correlation with human judgments over traditional overlap-based methods. Although PT-based methods have become the de facto standard for training grammatical error correction (GEC) systems, GEC evaluation still does not benefit from pretrained knowledge. This paper takes the first step towards understanding and improving GEC evaluation with pretraining. We first find that arbitrarily applying PT-based metrics to GEC evaluation brings unsatisfactory correlation results because of the excessive attention to inessential systems outputs (e.g., unchanged parts). To alleviate the limitation, we propose a novel GEC evaluation metric to achieve the best of both worlds, namely PT-M2 which only uses PT-based metrics to score those corrected parts. Experimental results on the CoNLL14 evaluation task show that PT-M2 significantly outperforms existing methods, achieving a new state-of-the-art result of 0.949 Pearson correlation. Further analysis reveals that PT-M2 is robust to evaluate competitive GEC systems. Source code and scripts are freely available at https://github.com/pygongnlp/PT-M2.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.695117950439453, 8.577905654907227], "id": 2166, "title": "R2D2: Robust Data-to-Text with Replacement Detection", "authors": "Linyong Nan, Lorenzo Jaime Flores, Yilun Zhao, Yixin Liu, Luke Benson, Weijin Zou and Dragomir Radev", "abstract": "Unfaithful text generation is a common problem for text generation systems. In the case of Data-to-Text (D2T) systems, the factuality of the generated text is particularly crucial for any real-world applications. We introduce R2D2, a training framework that addresses unfaithful Data-to-Text generation by training a system both as a generator and a faithfulness discriminator with additional replacement detection and unlikelihood learning tasks. To facilitate such training, we propose two methods for sampling unfaithful sentences. We argue that the poor entity retrieval capability of D2T systems is one of the primary sources of unfaithfulness, so in addition to the existing metrics, we further propose named entity based metrics to evaluate the fidelity of D2T generations. Our experimental results show that R2D2 systems could effectively mitigate the unfaithful text generation, and they achieve new state-of-theart results on FeTaQA, LogicNLG, and ToTTo, all with significant improvements.", "track": "Natural Language Generation", "label": 6}, {"loc": [2.8934531211853027, 4.71160364151001], "id": 2174, "title": "IDK-MRC: Unanswerable Questions for Indonesian Machine Reading Comprehension", "authors": "Rifki Afina Putri and Alice Oh", "abstract": "Machine Reading Comprehension (MRC) has become one of the essential tasks in Natural Language Understanding (NLU) as it is often included in several NLU benchmarks (Liang et al., 2020; Wilie et al., 2020). However, most MRC datasets only have answerable question type, overlooking the importance of unanswerable questions. MRC models trained only on answerable questions will select the span that is most likely to be the answer, even when the answer does not actually exist in the given passage (Rajpurkar et al., 2018). This problem especially remains in medium- to low-resource languages like Indonesian. Existing Indonesian MRC datasets (Purwarianti et al., 2007; Clark et al., 2020) are still inadequate because of the small size and limited question types, i.e., they only cover answerable questions. To fill this gap, we build a new Indonesian MRC dataset called I(n)don'tKnow- MRC (IDK-MRC) by combining the automatic and manual unanswerable question generation to minimize the cost of manual dataset construction while maintaining the dataset quality. Combined with the existing answerable questions, IDK-MRC consists of more than 10K questions in total. Our analysis shows that our dataset significantly improves the performance of Indonesian MRC models, showing a large improvement for unanswerable questions.", "track": "Resources and Evaluation", "label": 1}, {"loc": [10.57345199584961, 7.651828289031982], "id": 2190, "title": "XLM-D: Decorate Cross-lingual Pre-training Model as Non-Autoregressive Neural Machine Translation", "authors": "Yong Wang, Shilin He, Guanhua Chen, Yun Chen and Daxin Jiang", "abstract": "Pre-training language models have achieved thriving success in numerous natural language understanding and autoregressive generation tasks, but non-autoregressive generation in applications such as machine translation has not sufficiently benefited from the pre-training paradigm. In this work, we establish the connection between a pre-trained masked language model (MLM) and non-autoregressive generation on machine translation. From this perspective, we present XLM-D, which seamlessly transforms an off-the-shelf cross-lingual pre-training model into a non-autoregressive translation (NAT) model with a lightweight yet effective decorator. Specifically, the decorator ensures the representation consistency of the pre-trained model and brings only one additional trainable parameter. Extensive experiments on typical translation datasets show that our models obtain state-of-the-art performance while realizing the inference speed-up by 19.9x. One striking result is that on WMT14 En-De, our XLM-D obtains 29.80 BLEU points with multiple iterations, which outperforms the previous mask-predict model by 2.77 points.", "track": "Machine Translation", "label": 10}, {"loc": [0.6945322155952454, 7.86568546295166], "id": 2212, "title": "Cross-stitching Text and Knowledge Graph Encoders for Distantly Supervised Relation Extraction", "authors": "Qin Dai, Benjamin Heinzerling and Kentaro Inui", "abstract": "Bi-encoder architectures for distantly-supervised relation extraction are designed to make use of the complementary information found in text and knowledge graphs (KG).\nHowever, current architectures suffer from two drawbacks. They either do not allow any sharing between the text encoder and the KG encoder at all, or, in case of models with KG-to-text attention, only share information in one direction. Here, we introduce cross-stitch bi-encoders, which allow full interaction between the text encoder and the KG encoder via a cross-stitch mechanism. The cross-stitch mechanism allows sharing and updating representations between the two encoders at any layer, with the amount of sharing being dynamically controlled via cross-attention-based gates. Experimental results on two relation extraction benchmarks from two different domains show that enabling full interaction between the two encoders yields strong improvements.", "track": "Information Extraction", "label": 5}, {"loc": [5.682833194732666, 11.834573745727539], "id": 2217, "title": "Assist Non-native Viewers: Multimodal Cross-Lingual Summarization for How2 Videos", "authors": "Nayu Liu, Kaiwen Wei, Xian Sun, Hongfeng Yu, Fanglong Yao, li jin, Guo Zhi and Guangluan Xu", "abstract": "Multimodal summarization for videos aims to generate summaries from multi-source information (videos, audio transcripts), which has achieved promising progress. However, existing works are restricted to monolingual video scenarios, ignoring the demands of non-native video viewers to understand the cross-language videos in practical applications. It stimulates us to propose a new task, named Multimodal Cross-Lingual Summarization for videos (MCLS), which aims to generate cross-lingual summaries from multimodal inputs of videos. First, to make it applicable to MCLS scenarios, we conduct a Video-guided Dual Fusion network (VDF) that integrates multimodal and cross-lingual information via diverse fusion strategies at both encoder and decoder. Moreover, to alleviate the problem of high annotation costs and limited resources in MCLS, we propose a triple-stage training framework to assist MCLS by transferring the knowledge from monolingual multimodal summarization data, which includes: 1) multimodal summarization on sufficient prevalent language videos with a VDF model; 2) knowledge distillation (KD) guided adjustment on bilingual transcripts; 3) multimodal summarization for cross-lingual videos with a KD induced VDF model. Experiment results on the reorganized How2 dataset show that the VDF model alone outperforms previous methods for multimodal summarization, and the performance further improves by a large margin via the proposed triple-stage training framework.", "track": "Summarization", "label": 14}, {"loc": [2.7172839641571045, 4.711459159851074], "id": 2223, "title": "PACIFIC: Towards Proactive Conversational Question Answering over Tabular and Textual Data in Finance", "authors": "Yang Deng, Wenqiang Lei, Wenxuan Zhang, Wai Lam and Tat-Seng Chua", "abstract": "To facilitate conversational question answering (CQA) over hybrid contexts in finance, we present a new dataset, named PACIFIC. Compared with existing CQA datasets, PACIFIC exhibits three key features: (i) proactivity, (ii) numerical reasoning, and (iii) hybrid context of tables and text. A new task is defined accordingly to study Proactive Conversational Question Answering (PCQA), which combines clarification question generation and CQA. In addition, we propose a novel method, namely UniPCQA, to adapt a hybrid format of input and output content in PCQA into the Seq2Seq problem, including the reformulation of the numerical reasoning process as code generation. UniPCQA performs multi-task learning over all sub-tasks in PCQA and incorporates a simple ensemble strategy to alleviate the error propagation issue in the multi-task learning by cross-validating top-$k$ sampled Seq2Seq outputs. We benchmark the PACIFIC dataset with extensive baselines and provide comprehensive evaluations on each sub-task of PCQA.", "track": "Question Answering", "label": 11}, {"loc": [4.354979515075684, 5.935750961303711], "id": 2245, "title": "Generative Data Augmentation with Contrastive Learning for Zero-Shot Stance Detection", "authors": "Yang Li and Jiawei Yuan", "abstract": "Stance detection aims to identify whether the author of an opinionated text is in favor of, against, or neutral towards a given target. \nRemarkable success has been achieved when sufficient labeled training data is available. \nHowever, it is labor-intensive to annotate sufficient data and train the model for every new target.\nTherefore, zero-shot stance detection, aiming at identifying stances of unseen targets with seen targets, has gradually attracted attention. \nAmong them, one of the important challenges is to reduce the domain transfer between seen and unseen targets. \nTo tackle this problem, we propose a generative data augmentation approach to generate training samples containing targets and stances for testing data, and map the real samples and generated synthetic samples into the same embedding space with contrastive learning, then perform the final classification based on the augmented data. We evaluate our proposed model on two benchmark datasets. Experimental results show that our approach achieves state-of-the-art performance on most topics in the task of zero-shot stance detection.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [0.8079336285591125, 7.561835765838623], "id": 2249, "title": "Better Few-Shot Relation Extraction with Label Prompt Dropout", "authors": "Peiyuan Zhang and Wei Lu", "abstract": "Few-shot relation extraction aims to learn to identify the relation between two entities based on very limited training examples. Recent efforts found that textual labels (i.e., relation names and relation descriptions) could be extremely useful for learning class representations, which will benefit the few-shot learning task. However, what is the best way to leverage such label information in the learning process is an important research question. Existing works largely assume such textual labels are always present during both learning and prediction. In this work, we argue that such approaches may not always lead to optimal results. Instead, we present a novel approach called label prompt dropout, which randomly removes label descriptions in the learning process. Our experiments show that our approach is able to lead to improved class representations, yielding significantly better results on the few-shot relation extraction task.", "track": "Information Extraction", "label": 5}, {"loc": [8.652482986450195, 6.012907981872559], "id": 2251, "title": "Break it Down into BTS: Basic, Tiniest Subword Units for Korean", "authors": "Nayeon Kim, Jun-Hyung Park, Joon-Young Choi, Eojin JEJ Jeon, Youjin Kang and SangKeun Lee", "abstract": "We introduce Basic, Tiniest Subword (BTS) units for the Korean language, which are inspired by the invention principle of Hangeul, the Korean writing system. Instead of relying on 51 Korean consonant and vowel letters, we form the letters from BTS units by adding strokes or combining them. To examine the impact of BTS units on Korean language processing, we develop a novel BTS-based word embedding framework that is readily applicable to various models. Our experiments reveal that BTS units significantly improve the performance of Korean word embedding on all intrinsic and extrinsic tasks in our evaluation. In particular, BTS-based word embedding outperforms the state-of-theart Korean word embedding by 11.8% in word analogy. We further investigate the unique advantages provided by BTS units through indepth analysis.", "track": "Phonology, Morphology and Word Segmentation", "label": 25}, {"loc": [8.556219100952148, 7.872621536254883], "id": 2271, "title": "The Devil in Linear Transformer", "authors": "Zhen Qin, XiaoDong Han, Weixuan Sun, Dongxu Li, Lingpeng Kong, Nick Barnes and Yiran Zhong", "abstract": "Linear transformers aim to reduce the quadratic space-time complexity of vanilla transformers. However, they usually suffer from degraded performances on various tasks and corpus. In this paper, we examine existing kernel-based linear transformers and identify two key issues that lead to such performance gaps: 1) unbounded gradients in the attention computation adversely impact the convergence of linear transformer models; 2) attention dilution which trivially distributes attention scores over long sequences while neglecting neighbouring structures. To address these issues, we first identify that the scaling of attention matrices is the devil in unbounded gradients, which turns out unnecessary in linear attention as we show theoretically and empirically. To this end, we propose a new linear attention that replaces the scaling operation with a normalization to stabilize gradients. For the issue of attention dilution, we leverage a diagonal attention to confine attention to only neighbouring tokens in early layers. Benefiting from the stable gradients and improved attention, our new linear transformer model, transNormer, demonstrates superior performance on text classification and language modeling tasks, as well as on the challenging Long-Range Arena benchmark, surpassing vanilla transformer and existing linear variants by a clear margin while being significantly more space-time efficient. The code is available at https://github.com/OpenNLPLab/Transnormer .", "track": "Machine Learning for NLP", "label": 3}, {"loc": [7.997725009918213, 9.699125289916992], "id": 2273, "title": "Zero-Shot Learners for Natural Language Understanding via a Unified Multiple Choice Perspective", "authors": "Ping Yang, Junjie Wang, Ruyi Gan, Xinyu Zhu, Lin Zhang, Ziwei Wu, Xinyu Gao, Jiaxing Zhang and Tetsuya Sakai", "abstract": "We propose a new paradigm for zero-shot learners that is format agnostic, i.e., it is compatible with any format and applicable to a list of language tasks, such as text classification, commonsense reasoning, coreference resolution, and sentiment analysis. Zero-shot learning aims to train a model on a given task such that it can address new learning tasks without any additional training. Our approach converts zero-shot learning into multiple-choice tasks, avoiding problems in commonly used large-scale generative models such as FLAN. It not only adds generalization ability to models but also significantly reduces the number of parameters. Our method shares the merits of efficient training and deployment. Our approach shows state-of-the-art performance on several benchmarks and produces satisfactory results on tasks such as natural language inference and text classification. Our model achieves this success with only 235M parameters, which is substantially smaller than state-of-the-art models with billions of parameters. The code and pre-trained models are available at https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/main/fengshen/examples/unimc .", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [10.549112319946289, 7.258500099182129], "id": 2280, "title": "Hypoformer: Hybrid Decomposition Transformer for Edge-friendly Neural Machine Translation", "authors": "sunzhu li, Peng Zhang, Guobing Gan, Lv Xiuqing, Benyou Wang, Junqiu Wei and Xin Jiang", "abstract": "Transformer has been demonstrated effective in Neural Machine Translation (NMT). However, it is memory-consuming and time-consuming in edge devices, resulting in some difficulties for real-time feedback. To compress and accelerate Transformer, we propose a Hybrid Tensor-Train (HTT) decomposition, which retains full rank and meanwhile reduces operations and parameters. \nA Transformer using HTT, named Hypoformer, consistently and notably outperforms the recent light-weight SOTA methods on three standard translation tasks under different parameter and speed scales. In extreme low resource scenarios, Hypoformer has 7.1 points absolute improvement in BLEU and 1.27 X speedup than vanilla Transformer on IWSLT'14 De-En task.", "track": "Machine Translation", "label": 10}, {"loc": [6.460926532745361, 12.332314491271973], "id": 2281, "title": "FigMemes: A Dataset for Figurative Language Identification in Politically-Opinionated Memes", "authors": "Chen Liu, Gregor Geigle, Robin Krebs and Iryna Gurevych", "abstract": "Real-world politically-opinionated memes often rely on figurative language to cloak propaganda and radical ideas to help them spread. It is not only a scientific challenge to develop machine learning models to recognize them in memes, but also sociologically beneficial to understand hidden meanings at scale and raise awareness. These memes are fast-evolving (in both topics and visuals) and it remains unclear whether current multimodal machine learning models are robust to such distribution shifts. To enable future research into this area, we first present FigMemes, a dataset for figurative language classification in politically-opinionated memes. We evaluate the performance of state-of-the-art unimodal and multimodal models and provide comprehensive benchmark results. The key contributions of this proposed dataset include annotations of six commonly used types of figurative language in politically-opinionated memes, and a wide range of topics and visual styles.\nWe also provide analyses on the ability of multimodal models to generalize across distribution shifts in memes. Our dataset poses unique machine learning challenges and our results show that current models have significant room for improvement in both performance and robustness to distribution shifts.", "track": "Resources and Evaluation", "label": 1}, {"loc": [0.6469632983207703, 7.781680107116699], "id": 2289, "title": "UniRel: Unified Representation and Interaction for Joint Relational Triple Extraction", "authors": "Wei Tang, Benfeng Xu, Yuyue Zhao, Zhendong Mao, Yifeng Liu, Yong Liao and Haiyong Xie", "abstract": "Relational triple extraction is challenging for its difficulty in capturing rich correlations between entities and relations. Existing works suffer from 1) heterogeneous representations of entities and relations, and 2) heterogeneous modeling of entity-entity interactions and entity-relation interactions. Therefore, the rich correlations are not fully exploited by existing works. In this paper, we propose UniRel to address these challenges. Specifically, we unify the representations of entities and relations by jointly encoding them within a concatenated natural language sequence, and unify the modeling of interactions with a proposed Interaction Map, which is built upon the off-the-shelf self-attention mechanism within any Transformer block. With comprehensive experiments on two popular relational triple extraction datasets, we demonstrate that UniRel is more effective and computationally efficient. The source code is available at https://github.com/wtangdev/UniRel.", "track": "Information Extraction", "label": 5}, {"loc": [3.7879815101623535, 9.520407676696777], "id": 2290, "title": "X-FACTOR: A Cross-metric Evaluation of Factual Correctness in Abstractive Summarization", "authors": "Subhajit Chaudhury, Sarathkrishna Swaminathan, Chulaka Gunasekara, Maxwell Crouse, Srinivas Ravishankar, Daiki Kimura, Keerthiram Murugesan, Ram\u00f3n Fernandez Astudillo, Tahira Naseem, Pavan Kapanipathi and Alexander Gray", "abstract": "Abstractive summarization models often produce factually inconsistent summaries that are not supported by the original article. Recently, a number of fact-consistent evaluation techniques have been proposed to address this issue; however, a detailed analysis of how these metrics agree with one another has yet to be conducted. In this paper, we present X-FACTOR, a cross-evaluation of three high-performing fact-aware abstractive summarization methods. First, we show that summarization models are often fine-tuned on datasets that contain factually inconsistent summaries and propose a fact-aware filtering mechanism that improves the quality of training data and, consequently, the factuality of these models. Second, we propose a corrector module that can be used to improve the factual consistency of generated summaries. Third, we present a re-ranking technique that samples summary instances from the output distribution of a summarization model and re-ranks the sampled instances based on their factuality. Finally, we provide a detailed cross-metric agreement analysis that shows how tuning a model to output summaries based on a particular factuality metric influences factuality as determined by the other metrics. Our goal in this work is to facilitate research that improves the factuality and faithfulness of abstractive summarization models.", "track": "Summarization", "label": 14}, {"loc": [8.136359214782715, 5.221707820892334], "id": 2302, "title": "ParaTag: A Dataset of Paraphrase Tagging for Fine-Grained Labels, NLG Evaluation, and Data Augmentation", "authors": "Shuohang Wang, Ruochen Xu, Yang Liu, Chenguang Zhu and Michael Zeng", "abstract": "Paraphrase identification has been formulated as a binary classification task to decide whether two sentences hold a paraphrase relationship. Existing paraphrase datasets only annotate a binary label for each sentence pair. However, after a systematical analysis of existing paraphrase datasets, we found that the degree of paraphrase cannot be well characterized by a single binary label. And the criteria of paraphrase are not even consistent within the same dataset. We hypothesize that such issues would limit the effectiveness of paraphrase models trained on these data. To this end, we propose a novel fine-grained paraphrase annotation schema that labels the minimum spans of tokens in a sentence that don't have the corresponding paraphrases in the other sentence. Under this setting, we frame paraphrasing as a sequence tagging task. We collect 30k sentence pairs in English with the new annotation schema, resulting in the ParaTag dataset. In addition to reporting baseline results on ParaTag using state-of-art language models, we show that ParaTag is especially useful for training an automatic scorer for language generation evaluation. Finally, we train a paraphrase generation model from ParaTag and achieve better data augmentation performance on the GLUE benchmark than other public paraphrasing datasets.", "track": "Resources and Evaluation", "label": 1}, {"loc": [2.8944175243377686, 9.01243782043457], "id": 2308, "title": "Factual Accuracy is not Enough: Planning Consistent Description Order for Radiology Report Generation", "authors": "Toru Nishino, Yasuhide Miura, Tomoki Taniguchi, Tomoko Ohkuma, Yuki Suzuki, Shoji Kido and Noriyuki Tomiyama", "abstract": "Radiology report generation systems have the potential to reduce the workload of radiologists by automatically describing the findings in medical images.\nTo broaden the application of the report generation system, the system should generate reports that are not only factually accurate but also chronologically consistent, describing images that are presented in time order, that is, the correct order.\nWe employ a planning-based radiology report generation system that generates the overall structure of reports as \"plans'\" prior to generating reports that are accurate and consistent in order.\nAdditionally, we propose a novel reinforcement learning and inference method, Coordinated Planning (CoPlan), that includes a content planner and a text generator to train and infer in a coordinated manner to alleviate the cascading of errors that are often inherent in planning-based models.\nWe conducted experiments with single-phase diagnostic reports in which the factual accuracy is critical and multi-phase diagnostic reports in which the description order is critical.\nOur proposed CoPlan improves the content order score by 5.1 pt in time series critical scenarios and the clinical factual accuracy F-score by 9.1 pt in time series irrelevant scenarios, compared those of the baseline models without CoPlan.", "track": "NLP Applications", "label": 0}, {"loc": [5.1756486892700195, 4.886245250701904], "id": 2317, "title": "FLUTE: Figurative Language Understanding through Textual Explanations", "authors": "Tuhin Chakrabarty, Arkadiy Saakyan, Debanjan Ghosh and Smaranda Muresan", "abstract": "Figurative language understanding has been recently framed as a recognizing textual entailment (RTE) task (a.k.a. natural language inference (NLI)). However, similar to classical RTE/NLI datasets they suffer from spurious correlations and annotation artifacts. To tackle this problem, work on NLI has built explanation-based datasets such as eSNLI, allowing us to probe whether language models are right for the right reasons. Yet no such data exists for figurative language, making it harder to assess genuine understanding of such expressions. To address this issue, we release FLUTE, a dataset of 9,000 figurative NLI instances with explanations, spanning four categories: Sarcasm, Simile, Metaphor, and Idioms. We collect the data through a Human-AI collaboration framework based on GPT-3, crowd workers, and expert annotators. We show how utilizing GPT-3 in conjunction with human annotators (novices and experts) can aid in scaling up the creation of datasets even for such complex linguistic phenomena as figurative language. The baseline performance of the T5 model fine-tuned on FLUTE shows that our dataset can bring us a step closer to developing models that understand figurative language through textual explanations.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [5.863219738006592, 8.872162818908691], "id": 2335, "title": "Precisely the Point: Adversarial Augmentations for Faithful and Informative Text Generation", "authors": "Wenhao Wu, Wei Li, Jiachen Liu, Xinyan Xiao, Sujian Li and Yajuan Lyu", "abstract": "Though model robustness has been extensively studied in language understanding, the robustness of Seq2Seq generation remains understudied.\nIn this paper, we conduct the first quantitative analysis on the robustness of pre-trained Seq2Seq models. \nWe find that even current SOTA pre-trained Seq2Seq model (BART) is still vulnerable, which leads to significant degeneration in faithfulness and informativeness for text generation tasks.\nThis motivated us to further propose a novel adversarial augmentation framework, namely AdvSeq, for generally improving faithfulness and informativeness of Seq2Seq models via enhancing their robustness. \nAdvSeq automatically constructs two types of adversarial augmentations during training, including implicit adversarial samples by perturbing word representations and explicit adversarial samples by word swapping, both of which effectively improve Seq2Seq robustness.\nExtensive experiments on three popular text generation tasks demonstrate that AdvSeq significantly improves both the faithfulness and informativeness of Seq2Seq generation under both automatic and human evaluation settings.", "track": "Natural Language Generation", "label": 6}, {"loc": [4.276088237762451, 4.522980690002441], "id": 2340, "title": "RLET: A Reinforcement Learning Based Approach for Explainable QA with Entailment Trees", "authors": "Tengxiao Liu, Qipeng Guo, Xiangkun Hu, Yue Zhang, Xipeng Qiu and Zheng Zhang", "abstract": "Interpreting the reasoning process from questions to answers poses a challenge in approaching explainable QA. A recently proposed structured reasoning format, entailment tree, manages to offer explicit logical deductions with entailment steps in a tree structure. To generate entailment trees, prior single pass sequence-to-sequence models lack visible internal decision probability, while stepwise approaches are supervised with extracted single step data and cannot model the tree as a whole. In this work, we propose RLET, a Reinforcement Learning based Entailment Tree generation framework, which is trained utilising the cumulative signals across the whole tree. RLET iteratively performs single step reasoning with sentence selection and deduction generation modules, from which the training signal is accumulated across the tree with elaborately designed aligned reward function that is consistent with the evaluation. To the best of our knowledge, we are the first to introduce RL into the entailment tree generation task. Experiments on three settings of the EntailmentBank dataset demonstrate the strength of using RL framework.", "track": "Question Answering", "label": 11}, {"loc": [5.438992023468018, 5.092628479003906], "id": 2341, "title": "Let the CAT out of the bag: Contrastive Attributed explanations for Text", "authors": "Saneem Ahmed Chemmengath, Amar Prakash Azad, Ronny Luss and Amit Dhurandhar", "abstract": "Contrastive explanations for understanding the behavior of black box models has gained a lot of attention recently as they provide potential for recourse. In this paper, we propose a method Contrastive Attributed explanations for Text (CAT) which provides contrastive explanations for natural language text data with a novel twist as we build and exploit attribute classifiers leading to more semantically meaningful explanations. To ensure that our contrastive generated text has the fewest possible edits with respect to the original text, while also being fluent and close to a human generated contrastive, we resort to a minimal perturbation approach regularized using a BERT language model and attribute classifiers trained on available attributes. We show through qualitative examples and a user study that our method not only conveys more insight because of these attributes, but also leads to better quality (contrastive) text. Quantitatively, we show that our method outperforms other state-of-the-art methods across four data sets on four benchmark metrics.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [2.7155025005340576, 4.661579608917236], "id": 2342, "title": "monoQA: Multi-Task Learning of Reranking and Answer Extraction for Open-Retrieval Conversational Question Answering", "authors": "Sarawoot Kongyoung, Craig Macdonald and Iadh Ounis", "abstract": "To address the Conversational Question Answering (ORConvQA) task, previous work has considered an effective three-stage architecture, consisting of a retriever, a reranker, and a reader to extract the answers. In order to effectively answer the users' questions, a number of existing approaches have applied multi-task learning, such that the same model is shared between the reranker and the reader. Such approaches also typically tackle reranking and reading as classification tasks. On the other hand, recent text generation models, such as monoT5 and UnifiedQA, have been shown to respectively yield impressive performances in passage reranking and reading. However, no prior work has combined monoT5 and UnifiedQA to share a single text generation model that directly extracts the answers for the users instead of predicting the start/end positions in a retrieved passage. In this paper, we investigate the use of Multi-Task Learning (MTL) to improve performance on the ORConvQA task by sharing the reranker and reader's learned structure in a generative model. In particular, we propose monoQA, which uses a text generation model with multi-task learning for both the reranker and reader. Our model, which is based on the T5 text generation model, is fine-tuned simultaneously for both reranking (in order to improve the precision of the top retrieved passages) and extracting the answer. Our results on the OR-QuAC and OR-CoQA datasets demonstrate the effectiveness of our proposed model, which significantly outperforms existing strong baselines with improvements ranging from +12.31% to +19.51% in MAP and from +5.70% to +23.34% in F1 on all used test sets.", "track": "Question Answering", "label": 11}, {"loc": [5.176757335662842, 8.874801635742188], "id": 2356, "title": "Composing Ci with Reinforced Non-autoregressive Text Generation", "authors": "Yan Song", "abstract": "Composing Ci (also widely known as Song Ci), a special type of classical Chinese poetry, requires to follow particular format once their tune patterns are given. To automatically generate a well-formed Ci, text generation systems should strictly take into account pre-defined rigid formats (e.g., length and rhyme). Yet, most existing approaches regard Ci generation as a conventional sequence-to-sequence task and use autoregressive models, while it is challenging for such models to properly handle the constraints (according to tune patterns) of Ci during the generation process. Moreover, consider that with the format prepared, Ci generation can be operated by an efficient synchronous process, where autoregressive models are limited in doing so since they follow the character-by-character generation protocol. Therefore, in this paper, we propose to compose Ci through a non-autoregressive approach, which not only ensure that the generation process accommodates tune patterns by controlling the rhythm and essential meaning of each sentence, but also allow the model to perform synchronous generation. In addition, we further improve our approach by applying reinforcement learning to the generation process with the rigid constraints of Ci as well as the diversity in content serving as rewards, so as to further maintain the format and content requirement. Experiments on a collected Ci dataset confirm that our proposed approach outperforms strong baselines and previous studies in terms of both automatic evaluation metrics and human judgements.", "track": "Natural Language Generation", "label": 6}, {"loc": [0.684380054473877, 6.527635097503662], "id": 2367, "title": "MetaTKG: Learning Evolutionary Meta-Knowledge for Temporal Knowledge Graph Reasoning", "authors": "Yuwei Xia, Mengqi Zhang, Qiang Liu, Shu Wu and Xiao-Yu Zhang", "abstract": "Reasoning over Temporal Knowledge Graphs (TKGs) aims to predict future facts based on given history. One of the key challenges for prediction is to learn the evolution of facts. Most existing works focus on exploring evolutionary information in history to obtain effective temporal embeddings for entities and relations, but they ignore the variation in evolution patterns of facts, which makes them struggle to adapt to future data with different evolution patterns. Moreover, new entities continue to emerge along with the evolution of facts over time. Since existing models highly rely on historical information to learn embeddings for entities, they perform poorly on such entities with little historical information. To tackle these issues, we propose a novel Temporal Meta-learning framework for TKG reasoning, MetaTKG for brevity. Specifically, our method regards TKG prediction as many temporal meta-tasks, and utilizes the designed Temporal Meta-learner to learn evolutionary meta-knowledge from these meta-tasks. The proposed method aims to guide the backbones to learn to adapt quickly to future data and deal with entities with little historical information by the learned meta-knowledge. Specially, in temporal meta-learner, we design a Gating Integration module to adaptively establish temporal correlations between meta-tasks. Extensive experiments on four widely-used datasets and three backbones demonstrate that our method can greatly improve the performance.", "track": "Information Extraction", "label": 5}, {"loc": [5.438526630401611, 12.231659889221191], "id": 2369, "title": "mPLUG: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections", "authors": "Chenliang Li, Haiyang Xu, Junfeng Tian, Wei Wang, Ming Yan, Bin Bi, Jiabo Ye, He Chen, Guohai Xu, Zheng Cao, Ji Zhang, Songfang Huang, Fei Huang, Jingren Zhou and Luo Si", "abstract": "Large-scale pre-trained foundation models have been an emerging paradigm for building artificial intelligence (AI) systems, which can be quickly adapted to a wide range of downstream tasks. This paper presents mPLUG, a new vision-language foundation model for both cross-modal understanding and generation. Most existing pre-trained models suffer from inefficiency and linguistic signal overwhelmed by long visual sequences in cross-modal alignment. To address both problems, mPLUG introduces an effective and efficient vision-language architecture with novel cross-modal skip-connections.\n\nmPLUG is pre-trained end-to-end on large-scale image-text pairs with both discriminative and generative objectives. It achieves state-of-the-art results on a wide range of vision-language downstream tasks, including image captioning, image-text retrieval, visual grounding and visual question answering. mPLUG also demonstrates strong zero-shot transferability on vision-language and video-language tasks. The code and pre-trained models are available at https://github.com/alibaba/AliceMind", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.0623579025268555, 7.182478427886963], "id": 2370, "title": "Q-TOD: A Query-driven Task-oriented Dialogue System", "authors": "Xin Tian, Yingzhan Lin, Mengfei Song, Siqi Bao, Fan Wang, Huang He, Shuqi SUN and Hua Wu", "abstract": "Existing pipelined task-oriented dialogue systems usually have difficulties adapting to unseen domains, whereas end-to-end systems are plagued by large-scale knowledge bases in practice. In this paper, we introduce a novel query-driven task-oriented dialogue system, namely Q-TOD. The essential information from the dialogue context is extracted into a query, which is further employed to retrieve relevant knowledge records for response generation. Firstly, as the query is in the form of natural language and not confined to the schema of the knowledge base, the issue of domain adaption is alleviated remarkably in Q-TOD. Secondly, as the query enables the decoupling of knowledge retrieval from the generation, Q-TOD gets rid of the issue of knowledge base scalability. To evaluate the effectiveness of the proposed Q-TOD, we collect query annotations for three publicly available task-oriented dialogue datasets. Comprehensive experiments verify that Q-TOD outperforms strong baselines and establishes a new state-of-the-art performance on these datasets.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [4.369908809661865, 7.399176597595215], "id": 2374, "title": "dial2vec: Self-Guided Contrastive Learning of Unsupervised Dialogue Embeddings", "authors": "Che Liu, Rui Wang, Junfeng Jiang, Yongbin Li and Fei Huang", "abstract": "In this paper, we introduce the task of learning unsupervised dialogue embeddings.\nTrivial approaches such as combining pre-trained word or sentence embeddings and encoding through pre-trained language models (PLMs) have been shown to be feasible for this task.\nHowever, these approaches typically ignore the conversational interactions between interlocutors, resulting in poor performance.\nTo address this issue, we proposed a self-guided contrastive learning approach named dial2vec.\nDial2vec considers a dialogue as an information exchange process.\nIt captures the interaction patterns between interlocutors and leverages them to guide the learning of the embeddings corresponding to each interlocutor.\nThen the dialogue embedding is obtained by an aggregation of the embeddings from all interlocutors.\nTo verify our approach, we establish a comprehensive benchmark consisting of six widely-used dialogue datasets.\nWe consider three evaluation tasks: domain categorization, semantic relatedness, and dialogue retrieval.\nDial2vec achieves on average 8.7, 9.0, and 14.3 points absolute improvements in terms of purity, Spearman's correlation, and mean average precision (MAP) over the strongest baseline on the three tasks respectively.\nFurther analysis shows that dial2vec obtains informative and discriminative embeddings for both interlocutors under the guidance of the conversational interactions and achieves even better performance when aggregating them through the interlocutor-level pooling strategy.\nAll codes and data are publicly available at https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/dial2vec.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [5.117748260498047, 9.59858512878418], "id": 2379, "title": "WR-One2Set: Towards Well-Calibrated Keyphrase Generation", "authors": "Binbin Xie, Xiangpeng Wei, Baosong Yang, Huan Lin, Jun Xie, Xiaoli Wang, Min Zhang and Jinsong Su", "abstract": "Keyphrase generation aims to automatically generate short phrases summarizing an input document. The recently emerged ONE2SET paradigm (Ye et al., 2021) generates keyphrases as a set and has achieved competitive performance. Nevertheless, we observe serious calibration errors outputted by ONE2SET, especially in the over-estimation of \u2205 token (means \"no corresponding keyphrase\u201d). In this paper, we deeply analyze this limitation and identify two main reasons behind: 1) the parallel generation has to introduce excessive \u2205 as padding tokens into training instances; and 2) the training mechanism assigning target to each slot is unstable and further aggravates the \u2205 token over-estimation. To make the model well-calibrated, we propose WR-ONE2SET which extends ONE2SET with an adaptive instance-level cost Weighting strategy and a target Re-assignment mechanism. The former dynamically penalizes the over-estimated slots for different instances thus smoothing the uneven training distribution. The latter refines the original inappropriate assignment and reduces the supervisory signals of over-estimated slots. Experimental results on commonly-used datasets demonstrate the effectiveness and generality of our proposed paradigm.", "track": "Information Extraction", "label": 5}, {"loc": [8.741852760314941, 6.452250957489014], "id": 2381, "title": "Eeny, meeny, miny, moe. How to choose data for morphological inflection.", "authors": "Saliha Muradoglu and Mans Hulden", "abstract": "Data scarcity is a widespread problem for numerous natural language processing (NLP) tasks within low-resource languages. Within morphology, the labour-intensive task of tagging/glossing data is a serious bottleneck for both NLP and fieldwork. Active learning (AL) aims to reduce the cost of data annotation by selecting data that is most informative for the model. In this paper, we explore four sampling strategies for the task of morphological inflection using a Transformer model: a pair of oracle experiments where data is chosen based on correct/incorrect predictions by the model, model confidence, entropy, and random selection. We investigate the robustness of each sampling strategy across 30 typologically diverse languages, as well as a 10-cycle iteration using Nat\u00fcgu as a case study. Our results show a clear benefit to selecting data based on model confidence. Unsurprisingly, the oracle experiment, which is presented as a proxy for linguist/language informer feedback, shows the most improvement. This is followed closely by low-confidence and high-entropy forms. We also show that despite the conventional wisdom of larger data sets yielding better accuracy, introducing more instances of high-confidence, low-entropy, or forms that the model can already inflect correctly, can reduce model performance.", "track": "Phonology, Morphology and Word Segmentation", "label": 25}, {"loc": [0.7400525212287903, 6.518398284912109], "id": 2390, "title": "An Adaptive Logical Rule Embedding Model for Inductive Reasoning over Temporal Knowledge Graphs", "authors": "Xin Mei, Libin Yang, Xiaoyan Cai and zuowei jiang", "abstract": "Temporal knowledge graphs (TKGs) extrapolation reasoning predicts future events based on historical information, which has great research significance and broad application value. Existing methods can be divided into embedding-based methods and logical rule-based methods. Embedding-based methods rely on learned entity and relation embeddings to make predictions and thus lack interpretability. Logical rule-based methods bring scalability problems due to being limited by the learned logical rules. We combine the two methods to capture deep causal logic by learning rule embeddings, and propose an interpretable model for temporal knowledge graph reasoning called adaptive logical rule embedding model for inductive reasoning (ALRE-IR). ALRE-IR can adaptively extract and assess reasons contained in historical events, and make predictions based on causal logic. Furthermore, we propose a one-class augmented matching loss for optimization. When evaluated on the ICEWS14, ICEWS0515 and ICEWS18 datasets, the performance of ALRE-IR outperforms other state-of-the-art baselines. The results also demonstrate that ALRE-IR still shows outstanding performance when transferred to related dataset with common relation vocabulary, indicating our proposed model has good zero-shot reasoning ability.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [5.366840839385986, 7.528066158294678], "id": 2392, "title": "UniNL: Aligning Representation Learning with Scoring Function for OOD Detection via Unified Neighborhood Learning", "authors": "Yutao Mou, Pei Wang, Keqing He, Yanan Wu, Jingang Wang, Wei Wu and Weiran Xu", "abstract": "Detecting out-of-domain (OOD) intents from user queries is essential for avoiding wrong operations in task-oriented dialogue systems. The key challenge is how to distinguish in-domain (IND) and OOD intents. Previous methods ignore the alignment between representation learning and scoring function, limiting the OOD detection performance. In this paper, we propose a unified neighborhood learning framework (UniNL) to detect OOD intents. Specifically, we design a KNCL objective for representation learning, and introduce a KNN-based scoring function for OOD detection. We aim to align representation learning with scoring function. Experiments and analysis on two benchmark datasets show the effectiveness of our method.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [5.003041744232178, 11.980752944946289], "id": 2398, "title": "Open-domain Video Commentary Generation", "authors": "Edison Marrese-Taylor, Yumi Hamazono, Tatsuya Ishigaki, Goran Topic, Yusuke Miyao, Ichiro Kobayashi and Hiroya Takamura", "abstract": "Live commentary plays an important role in sports broadcasts and video games, making spectators more excited and immersed. In this context, though approaches for automatically generating such commentary have been proposed in the past, they have been generally concerned with specific fields, where it is possible to leverage domain-specific information. In light of this, we propose the task of generating video commentary in an open-domain fashion. We detail the construction of a new large-scale dataset of transcribed commentary aligned with videos containing various human actions in a variety of domains, and propose approaches based on well-known neural architectures to tackle the task. To understand the strengths and limitations of current approaches, we present an in-depth empirical study based on our data. Our results suggest clear trade-offs between textual and visual inputs for the models and highlight the importance of relying on external knowledge in this open-domain setting, resulting in a set of robust baselines for our task.", "track": "Resources and Evaluation", "label": 1}, {"loc": [8.120742797851562, 3.018866777420044], "id": 2399, "title": "One size does not fit all: Investigating strategies for differentially-private learning across NLP tasks", "authors": "Manuel Senge, Timour Igamberdiev and Ivan Habernal", "abstract": "Preserving privacy in contemporary NLP models allows us to work with sensitive data, but unfortunately comes at a price. We know that stricter privacy guarantees in differentially-private stochastic gradient descent (DP-SGD) generally degrade model performance. However, previous research on the efficiency of DP-SGD in NLP is inconclusive or even counter-intuitive. In this short paper, we provide an extensive analysis of different privacy preserving strategies on seven downstream datasets in five different `typical' NLP tasks with varying complexity using modern neural models based on BERT and XtremeDistil architectures. We show that unlike standard non-private approaches to solving NLP tasks, where bigger is usually better, privacy-preserving strategies do not exhibit a winning pattern, and each task and privacy regime requires a special treatment to achieve adequate performance.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.556511878967285, 8.927638053894043], "id": 2401, "title": "Counterfactual Recipe Generation: Exploring Compositional Generalization in a Realistic Scenario", "authors": "Xiao Liu, Yansong Feng, Jizhi Tang, Chengang Hu and Dongyan Zhao", "abstract": "People can acquire knowledge in an unsupervised manner by reading, and compose the knowledge to make novel combinations. In this paper, we investigate whether pretrained language models can perform compositional generalization in a realistic setting: recipe generation. We design the counterfactual recipe generation task, which asks models to modify a base recipe according to the change of an ingredient. This task requires compositional generalization at two levels: the surface level of incorporating the new ingredient into the base recipe, and the deeper level of adjusting actions related to the changing ingredient. \nWe collect a large-scale recipe dataset in Chinese for models to learn culinary knowledge, and a subset of action-level fine-grained annotations for evaluation.\nWe finetune pretrained language models on the recipe corpus, and use unsupervised counterfactual generation methods to generate modified recipes.\nResults show that existing models have difficulties in modifying the ingredients while preserving the original text style, and often miss actions that need to be adjusted. Although pretrained language models can generate fluent recipe texts, they fail to truly learn and use the culinary knowledge in a compositional way. \nCode and data are available at https://github.com/xxxiaol/counterfactual-recipe-generation.", "track": "Theme Track", "label": 18}, {"loc": [7.699915409088135, 7.951632499694824], "id": 2406, "title": "Tutoring Helps Students Learn Better: Improving Knowledge Distillation for BERT with Tutor Network", "authors": "Junho Kim, Jun-Hyung Park, Mingyu Lee, Wing-Lam Mok, Joon-Young Choi and SangKeun Lee", "abstract": "Pre-trained language models have achieved remarkable successes in natural language processing tasks, coming at the cost of increasing model size. To address this issue, knowledge distillation (KD) has been widely applied to compress language models. However, typical KD approaches for language models have overlooked the difficulty of training examples, suffering from incorrect teacher prediction transfer and sub-efficient training. In this paper, we propose a novel KD framework, Tutor-KD, which improves the distillation effectiveness by controlling the difficulty of training examples during pre-training. We introduce a tutor network that generates samples that are easy for the teacher but difficult for the student, with training on a carefully designed policy gradient method. Experimental results show that Tutor-KD significantly and consistently outperforms the state-of-the-art KD methods with variously sized student models on the GLUE benchmark, demonstrating that the tutor can effectively generate training examples for the student.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [8.79207706451416, 6.52491569519043], "id": 2414, "title": "Does Corpus Quality Really Matter for Low-Resource Languages?", "authors": "Mikel Artetxe, Itziar Aldabe, Rodrigo Agerri, Olatz Perez-de-Vi\u00f1aspre and Aitor Soroa", "abstract": "The vast majority of non-English corpora are derived from automatically filtered versions of CommonCrawl. While prior work has identified major issues on the quality of these datasets (Kreutzer et al., 2021), it is not clear how this impacts downstream performance. Taking representation learning in Basque as a case study, we explore tailored crawling (manually identifying and scraping websites with high-quality content) as an alternative to filtering CommonCrawl. Our new corpus, called EusCrawl, is similar in size to the Basque portion of popular multilingual corpora like CC100 and mC4, yet it has a much higher quality according to native annotators. For instance, 66% of documents are rated as high-quality for EusCrawl, in contrast with <33% for both mC4 and CC100. Nevertheless, we obtain similar results on downstream NLU tasks regardless of the corpus used for pre-training. Our work suggests that NLU performance in low-resource languages is not primarily constrained by the quality of the data, and other factors like corpus size and domain coverage can play a more important role.", "track": "Multilinguality", "label": 13}, {"loc": [5.822017192840576, 5.82810115814209], "id": 2419, "title": "Unifying Data Perspectivism and Personalization: An Application to Social Norms", "authors": "Joan Plepi, B\u00e9la Neuendorf, Lucie Flek and Charles Welch", "abstract": "Instead of using a single ground truth for language processing tasks, several recent studies have examined how to represent and predict the labels of the set of annotators. However, often little or no information about annotators is known, or the set of annotators is small. In this work, we examine a corpus of social media posts about conflict from a set of 13k annotators and 210k judgements of social norms. We provide a novel experimental setup that applies personalization methods to the modeling of annotators and compare their effectiveness for predicting the perception of social norms. We further provide an analysis of performance across subsets of social situations that vary by the closeness of the relationship between parties in conflict, and assess where personalization helps the most.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [5.447420597076416, 5.082921028137207], "id": 2425, "title": "Does Self-Rationalization Improve Robustness to Spurious Correlations?", "authors": "Alexis Ross, Matthew Peters and Ana Marasovic", "abstract": "Rationalization is fundamental to human reasoning and learning. NLP models trained to produce rationales along with predictions, called self-rationalization models, have been investigated for their interpretability and utility to end-users. However, the extent to which training with human-written rationales facilitates learning remains an under-explored question. We ask whether training models to self-rationalize can aid in their learning to solve tasks for the right reasons. Specifically, we evaluate how training self-rationalization models with free-text rationales affects robustness to spurious correlations in fine-tuned encoder-decoder and decoder-only models of six different sizes. We evaluate robustness to spurious correlations by measuring performance on 1) manually annotated challenge datasets and 2) subsets of original test sets where reliance on spurious correlations would fail to produce correct answers. We find that while self-rationalization can improve robustness to spurious correlations in low-resource settings, it tends to hurt robustness in higher-resource settings. Furthermore, these effects depend on model family and size, as well as on rationale content. Together, our results suggest that explainability can come at the cost of robustness; thus, appropriate care should be taken when training self-rationalizing models with the goal of creating more trustworthy models.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [8.064118385314941, 7.698309898376465], "id": 2434, "title": "Efficient Pre-training of Masked Language Model via Concept-based Curriculum Masking", "authors": "Mingyu Lee, Jun-Hyung Park, Junho Kim, Kang-Min Kim and SangKeun Lee", "abstract": "Self-supervised pre-training has achieved remarkable success in extensive natural language processing tasks. Masked language modeling (MLM) has been widely used for pre-training effective bidirectional representations but comes at a substantial training cost. In this paper, we propose a novel concept-based curriculum masking (CCM) method to efficiently pre-train a language model. CCM has two key differences from existing curriculum learning approaches to effectively reflect the nature of MLM. First, we introduce a novel curriculum that evaluates the MLM difficulty of each token based on a carefully-designed linguistic difficulty criterion. Second, we construct a curriculum that masks easy words and phrases first and gradually masks related ones to the previously masked ones based on a knowledge graph. Experimental results show that CCM significantly improves pre-training efficiency. Specifically, the model trained with CCM shows comparative performance with the original BERT on the General Language Understanding Evaluation benchmark at half of the training cost.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [9.183977127075195, 6.353485584259033], "id": 2438, "title": "Subword Evenness (SuE) as a Predictor of Cross-lingual Transfer to Low-resource Languages", "authors": "Olga Pelloni, Anastassia Shaitarova and Tanja Samardzic", "abstract": "Pre-trained multilingual models, such as mBERT, XLM-R and mT5, are used to improve the performance on various tasks in low-resource languages via cross-lingual transfer. In this framework, English is usually seen as the most natural choice for a transfer language (for fine-tuning or continued training of a multilingual pre-trained model), but it has been revealed recently that this is often not the best choice. The success of cross-lingual transfer seems to depend on some properties of languages, which are currently hard to explain. Successful transfer often happens between unrelated languages and it often cannot be explained by data-dependent factors.\n\nIn this study, we show that languages written in non-Latin and non-alphabetic scripts (mostly Asian languages) are the best choices for improving performance on the task of Masked Language Modelling (MLM) in a diverse set of 30 low-resource languages and that the success of the transfer is well predicted by our novel measure of Subword Evenness (SuE). Transferring language models over the languages that score low on our measure results in the lowest average perplexity over target low-resource languages. Our correlation coefficients obtained with three different pre-trained multilingual models are consistently higher than all the other predictors, including text-based measures (type-token ratio, entropy) and linguistically motivated choice (genealogical and typological proximity).", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [6.066429138183594, 8.308318138122559], "id": 2439, "title": "A Unified Neural Network Model for Readability Assessment with Feature Projection and Length-Balanced Loss", "authors": "Wenbiao Li, Wang Ziyang and Yunfang Wu", "abstract": "Readability assessment is a basic research task in the field of education. Traditional methods mainly employ machine learning classifiers with hundreds of linguistic features. Although the deep learning model has become the prominent approach for almost all NLP tasks, it is less explored for readability assessment. In this paper, we propose a BERT-based model with feature projection and length-balanced loss (BERT-FP-LBL) to determine the difficulty level of a given text. First, we introduce topic features guided by difficulty knowledge to complement the traditional linguistic features. From the linguistic features, we extract really useful orthogonal features to supplement BERT representations by means of projection filtering. Furthermore, we design a length-balanced loss to handle the greatly varying length distribution of the readability data. We conduct experiments on three English benchmark datasets and one Chinese dataset, and the experimental results show that our proposed model achieves significant improvements over baseline models. Interestingly, our proposed model achieves comparable results with human experts in consistency test.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [6.561717987060547, 11.84943675994873], "id": 2444, "title": "Speaker Overlap-aware Neural Diarization for Multi-party Meeting Analysis", "authors": "Zhihao Du, ShiLiang Zhang, Siqi Zheng and Zhi-Jie Yan", "abstract": "Recently, hybrid systems of clustering and neural diarization models have been successfully applied in multi-party meeting analysis. However, current models always treat overlapped speaker diarization as a multi-label classification problem, where speaker dependency and overlaps are not well considered. To overcome the disadvantages, we reformulate overlapped speaker diarization task as a single-label prediction problem via the proposed power set encoding (PSE). Through this formulation, speaker dependency and overlaps can be explicitly modeled. To fully leverage this formulation, we further propose the speaker overlap-aware neural diarization (SOND) model, which consists of a context-independent (CI) scorer to model global speaker discriminability, a context-dependent scorer (CD) to model local discriminability, and a speaker combining network (SCN) to combine and reassign speaker activities. Experimental results show that using the proposed formulation can outperform the state-of-the-art methods based on target speaker voice activity detection, and the performance can be further improved with SOND, resulting in a 6.30% relative diarization error reduction.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.801635265350342, 5.909019947052002], "id": 2447, "title": "GREENER: Graph Neural Networks for News Media Profiling", "authors": "Panayot Panayotov, Utsav Shukla, Husrev Taha Sencar, Mohamed Nabeel and Preslav Nakov", "abstract": "We study the problem of profiling news media on the Web with respect to their factuality of reporting and bias. This is an important but under-studied problem related to disinformation and ``fake news'' detection, but it addresses the issue at a coarser granularity compared to looking at an individual article or an individual claim. This is useful as it allows to profile entire media outlets in advance. Unlike previous work, which has focused primarily on text (e.g.,~on the text of the articles published by the target website, or on the textual description in their social media profiles or in Wikipedia), here our main focus is on modeling the similarity between media outlets based on the overlap of their audience. This is motivated by homophily considerations, i.e.,~the tendency of people to have connections to people with similar interests, which we extend to media, hypothesizing that similar types of media would be read by similar kinds of users. In particular, we propose GREENER (GRaph nEural nEtwork for News mEdia pRofiling), a model that builds a graph of inter-media connections based on their audience overlap, and then uses graph neural networks to represent each medium. We find that such representations are quite useful for predicting the factuality and the bias of news media outlets, yielding improvements over state-of-the-art results reported on two datasets. When augmented with conventionally used representations obtained from news articles, Twitter, YouTube, Facebook, and Wikipedia, prediction accuracy is found to improve by 2.5-27 macro-F1 points for the two tasks.", "track": "NLP Applications", "label": 0}, {"loc": [0.6769068837165833, 6.5195817947387695], "id": 2452, "title": "Graph Hawkes Transformer for Extrapolated Reasoning on Temporal Knowledge Graphs", "authors": "Haohai Sun, Shangyi Geng, Jialun Zhong, Han Hu and Kun He", "abstract": "Temporal Knowledge Graph (TKG) reasoning has attracted increasing attention due to its enormous potential value, and the critical issue is how to model the complex temporal structure information effectively. Recent studies use the method of encoding graph snapshots into hidden vector space and then performing heuristic deductions, which perform well on the task of entity prediction. However, these approaches cannot predict when an event will occur and have the following limitations: 1) there are many facts not related to the query that can confuse the model; 2) there exists information forgetting caused by long-term evolutionary processes. To this end, we propose a Graph Hawkes Transformer (GHT) for both TKG entity prediction and time prediction tasks in the future time. In GHT, there are two variants of Transformer, which capture the instantaneous structural information and temporal evolution information, respectively, and a new relational continuous-time encoding function to facilitate feature evolution with the Hawkes process. Extensive experiments on four public datasets demonstrate its superior performance, especially on long-term evolutionary tasks.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [3.9995765686035156, 4.112773895263672], "id": 2459, "title": "UniRPG: Unified Discrete Reasoning over Table and Text as Program Generation", "authors": "Yongwei Zhou, Junwei Bao, Chaoqun Duan, Youzheng Wu, Xiaodong He and Tiejun Zhao", "abstract": "Question answering requiring discrete reasoning, e.g., arithmetic computing, comparison, and counting, over knowledge is a challenging task.\nIn this paper, we propose UniRPG, a semantic-parsing-based approach advanced in interpretability and scalability, to perform Unified discrete Reasoning over heterogeneous knowledge resources, i.e., table and text, as Program Generation. \nConcretely, UniRPG consists of a neural programmer and a symbolic program executor,\nwhere a program is the composition of a set of pre-defined general atomic and higher-order operations and arguments extracted from table and text.\nFirst, the programmer parses a question into a program by generating operations and copying arguments, and then, the executor derives answers from table and text based on the program.\nTo alleviate the costly program annotation issue, we design a distant supervision approach for programmer learning, where pseudo programs are automatically constructed without annotated derivations.\nExtensive experiments on the TAT-QA dataset show that UniRPG achieves tremendous improvements and enhances interpretability and scalability compared with previous state-of-the-art methods, even without derivation annotation.\nMoreover, it achieves promising performance on the textual dataset DROP without derivation annotation.", "track": "Question Answering", "label": 11}, {"loc": [8.113082885742188, 9.489667892456055], "id": 2464, "title": "Don't Prompt, Search! Mining-based Zero-Shot Learning with Language Models", "authors": "Mozes van de Kar, Mengzhou Xia, Danqi Chen and Mikel Artetxe", "abstract": "Masked language models like BERT can perform text classification in a zero-shot fashion by reformulating downstream tasks as text infilling. However, this approach is highly sensitive to the template used to prompt the model, yet practitioners are blind when designing them in strict zero-shot settings. In this paper, we propose an alternative mining-based approach for zero-shot learning. Instead of prompting language models, we use regular expressions to mine labeled examples from unlabeled corpora, which can optionally be filtered through prompting, and used to finetune a pretrained model. Our method is more flexible and interpretable than prompting, and outperforms it on a wide range of tasks when using comparable templates. Our results suggest that the success of prompting can partly be explained by the model being exposed to similar examples during pretraining, which can be directly retrieved through regular expressions.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [1.0529420375823975, 10.533020973205566], "id": 2474, "title": "SEMGraph: Incorporating Sentiment Knowledge and Eye Movement into Graph Model for Sentiment Analysis", "authors": "Bingbing Wang, Bin Liang, Jiachen Du, Min Yang and Ruifeng Xu", "abstract": "This paper investigates the sentiment analysis task from a novel perspective by incorporating sentiment knowledge and eye movement into a graph architecture, aiming to draw the eye movement-based sentiment relationships for learning the sentiment expression of the context. To be specific, we first explore a linguistic probing eye movement paradigm to extract eye movement features based on the close relationship between linguistic features and the early and late processes of human reading behavior. Furthermore, to derive eye movement features with sentiment concepts, we devise a novel weighting strategy to integrate sentiment scores extracted from affective commonsense knowledge into eye movement features, called sentiment-eye movement weights. Then, the sentiment-eye movement weights are exploited to build the sentiment-eye movement guided graph (SEMGraph) model, so as to model the intricate sentiment relationships in the context. Experimental results on two sentiment analysis datasets with eye movement signals and three sentiment analysis datasets without eye movement signals show that the proposed SEMGraph achieves state-of-the-art performance, and can also be directly generalized to those sentiment analysis datasets without eye movement signals.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [10.704315185546875, 6.887051582336426], "id": 2479, "title": "Cross-lingual neural fuzzy matching for exploiting target-language monolingual corpora in computer-aided translation", "authors": "Miquel Espl\u00e0-Gomis, V\u00edctor M. S\u00e1nchez-Cartagena, Juan Antonio P\u00e9rez-Ortiz and Felipe S\u00e1nchez-Mart\u00ednez", "abstract": "Computer-aided translation (CAT) tools based on translation memories (MT) play a prominent role in the translation workflow of professional translators. However, the reduced availability of in-domain TMs, as compared to in-domain monolingual corpora, limits its adoption for a number of translation tasks. In this paper, we introduce a novel neural approach aimed at overcoming this limitation by exploiting not only TMs, but also in-domain target-language (TL) monolingual corpora, and still enabling a similar functionality to that offered by conventional TM-based CAT tools. Our approach relies on cross-lingual sentence embeddings to retrieve translation proposals from TL monolingual corpora, and on a neural model to estimate their post-editing effort. The paper presents an automatic evaluation of these techniques on four language pairs that shows that our approach can successfully exploit monolingual texts in a TM-based CAT environment, increasing the amount of useful translation proposals, and that our neural model for estimating the post-editing effort enables the combination of translation proposals obtained from monolingual corpora and from TMs in the usual way. A human evaluation performed on a single language pair confirms the results of the automatic evaluation and seems to indicate that the translation proposals retrieved with our approach are more useful than what the automatic evaluation shows.", "track": "NLP Applications", "label": 0}, {"loc": [4.404667854309082, 7.503066062927246], "id": 2491, "title": "Multi-Label Intent Detection via Contrastive Task Specialization of Sentence Encoders", "authors": "Ivan Vuli\u0107, I\u00f1igo Casanueva, Georgios Spithourakis, Avishek Mondal, Tsung-Hsien Wen and Pawe\u0142 Budzianowski", "abstract": "Deploying task-oriented dialog ToD systems for new domains and tasks requires natural language understanding models that are 1) resource-efficient and work under low-data regimes; 2) adaptable, efficient, and quick-to-train; 3) expressive and can handle complex ToD scenarios with multiple user intents in a single utterance. Motivated by these requirements, we introduce a novel framework for multi-label intent detection (mID): MultI-ConvFiT (Multi-Label Intent Detection via Contrastive Conversational Fine-Tuning). While previous work on efficient single-label intent detection learns a classifier on top of a fixed sentence encoder (SE), we propose to 1) transform general-purpose SEs into task-specialized SEs via contrastive fine-tuning on annotated multi-label data, 2) where task specialization knowledge can be stored into lightweight adapter modules without updating the original parameters of the input SE, and then 3) we build improved mID classifiers stacked on top of fixed specialized SEs. Our main results indicate that MultI-ConvFiT yields effective mID models, with large gains over non-specialized SEs reported across a spectrum of different mID datasets, both in low-data and high-data regimes.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [9.08772087097168, 6.259606838226318], "id": 2495, "title": "Discovering Language-neutral Sub-networks in Multilingual Language Models", "authors": "Negar Foroutan, Mohammadreza Banaei, R\u00e9mi Lebret, Antoine Bosselut and Karl Aberer", "abstract": "Multilingual pre-trained language models transfer remarkably well on cross-lingual downstream tasks. However, the extent to which they learn language-neutral representations (i.e., shared representations that encode similar phenomena across languages), and the effect of such representations on cross-lingual transfer performance, remain open questions.\n\nIn this work, we conceptualize language neutrality of multilingual models as a function of the overlap between language-encoding sub-networks of these models. We employ the lottery ticket hypothesis to discover sub-networks that are individually optimized for various languages and tasks. Our evaluation across three distinct tasks and eleven typologically-diverse languages demonstrates that sub-networks for different languages are topologically similar (i.e., language-neutral), making them effective initializations for cross-lingual transfer with limited performance degradation.", "track": "Multilinguality", "label": 13}, {"loc": [8.636474609375, 8.399713516235352], "id": 2496, "title": "Parameter-Efficient Tuning Makes a Good Classification Head", "authors": "Zhuoyi Yang, Ming Ding, Yanhui Guo, Qingsong Lv and Jie Tang", "abstract": "In recent years, pretrained models revolutionized the paradigm of natural language understanding (NLU), where we append a randomly initialized classification head after the pretrained backbone, e.g. BERT, and finetune the whole model. As the pretrained backbone makes a major contribution to the improvement, we naturally expect a good pretrained classification head can also benefit the training. However, the final-layer output of the backbone, i.e. the input of the classification head, will change greatly during finetuning, making the usual head-only pretraining ineffective. In this paper, we find that parameter-efficient tuning makes a good classification head, with which we can simply replace the randomly initialized heads for a stable performance gain. Our experiments demonstrate that the classification head jointly pretrained with parameter-efficient tuning consistently improves the performance on 9 tasks in GLUE and SuperGLUE.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [7.28510856628418, 9.448281288146973], "id": 2499, "title": "STGN: an Implicit Regularization Method for Learning with Noisy Labels in Natural Language Processing", "authors": "Tingting Wu, Xiao Ding, Minji Tang, Hao Zhang, Bing Qin and Ting Liu", "abstract": "Noisy labels are ubiquitous in natural language processing (NLP) tasks. Existing work, namely learning with noisy labels in NLP, is often limited to dedicated tasks or specific training procedures, making it hard to be widely used. To address this issue, SGD noise has been explored to provide a more general way to alleviate the effect of noisy labels by involving benign noise in the process of stochastic gradient descent. \nHowever, previous studies exert identical perturbation for all samples, which may cause overfitting on incorrect ones or optimizing correct ones inadequately. To facilitate this, we propose a novel stochastic tailor-made gradient noise (STGN), mitigating the effect of inherent label noise by introducing tailor-made benign noise for each sample. Specifically, we investigate multiple principles to precisely and stably discriminate correct samples from incorrect ones and thus apply different intensities of perturbation to them. A detailed theoretical analysis shows that STGN has good properties, beneficial for model generalization. Experiments on three different NLP tasks demonstrate the effectiveness and versatility of STGN. Also, STGN can boost existing robust training methods.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [5.308909893035889, 12.468986511230469], "id": 2506, "title": "Cross-Modal Similarity-Based Curriculum Learning for Image Captioning", "authors": "Hongkuan Zhang, Saku Sugawara, Akiko Aizawa, Lei Zhou, Ryohei Sasano and Koichi Takeda", "abstract": "Image captioning models require the high-level generalization ability to describe the contents of various images in words. Most existing approaches treat the image--caption pairs equally in their training without considering the differences in their learning difficulties. Several image captioning approaches introduce curriculum learning methods that present training data with increasing levels of difficulty. However, their difficulty measurements are either based on domain-specific features or prior model training. In this paper, we propose a simple yet efficient difficulty measurement for image captioning using cross-modal similarity calculated by a pretrained vision--language model. Experiments on the COCO and Flickr30k datasets show that our proposed approach achieves superior performance and competitive convergence speed to baselines without requiring heuristics or incurring additional training costs. Moreover, the higher model performance on difficult examples and unseen data also demonstrates the generalization ability.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [6.291426181793213, 5.353306293487549], "id": 2513, "title": "Debiasing Masks: A New Framework for Shortcut Mitigation in NLU", "authors": "Johannes Mario Meissner, Saku Sugawara and Akiko Aizawa", "abstract": "Debiasing language models from unwanted behaviors in Natural Language Understanding (NLU) tasks is a topic with rapidly increasing interest in the NLP community. Spurious statistical correlations in the data allow models to perform shortcuts and avoid uncovering more advanced and desirable linguistic features.\nA multitude of effective debiasing approaches has been proposed, but flexibility remains a major issue. For the most part, models must be retrained to find a new set of weights with debiased behavior.\nWe propose a new debiasing method in which we identify debiased pruning masks that can be applied to a finetuned model. This enables the selective and conditional application of debiasing behaviors.\nWe assume that bias is caused by a certain subset of weights in the network; our method is, in essence, a mask search to identify and remove biased weights.\nOur masks show equivalent or superior performance to the standard counterparts, while offering important benefits.\nPruning masks can be stored with high efficiency in memory, and it becomes possible to switch among several debiasing behaviors (or revert back to the original biased model) at inference time. Finally, it opens the doors to further research on how biases are acquired by studying the generated masks. For example, we observed that the early layers and attention heads were pruned more aggressively, possibly hinting towards the location in which biases may be encoded.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [5.322630405426025, 12.388042449951172], "id": 2528, "title": "Extending Phrase Grounding with Pronouns in Visual Dialogues", "authors": "Panzhong Lu, Xin Zhang, Meishan Zhang and Min Zhang", "abstract": "Conventional phrase grounding aims to localize noun phrases mentioned in a given caption to their corresponding image regions, which has achieved great success recently. Apparently, sole noun phrase grounding is not enough for cross-modal visual language understanding. Here we extend the task by considering pronouns as well. First, we construct a dataset of phrase grounding with both noun phrases and pronouns to image regions. Based on the dataset, we test the performance of phrase grounding by using a state-of-the-art literature model of this line. Then, we enhance the baseline grounding model with coreference information which should help our task potentially, modeling the coreference structures with graph convolutional networks. Experiments on our dataset, interestingly, show that pronouns are easier to ground than noun phrases, where the possible reason might be that these pronouns are much less ambiguous. Additionally, our final model with coreference information can significantly boost the grounding performance of both noun phrases and pronouns.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [3.7409400939941406, 9.34959888458252], "id": 2537, "title": "EUR-Lex-Sum: A Multi- and Cross-lingual Dataset for Long-form Summarization in the Legal Domain", "authors": "Dennis Aumiller, Ashish Chouhan and Michael Gertz", "abstract": "Existing summarization datasets come with two main drawbacks: \n(1) They tend to focus on overly exposed domains, such as news articles or wiki-like texts, and \n(2) are primarily monolingual, with few multilingual datasets.\nIn this work, we propose a novel dataset, called EUR-Lex-Sum, based on manually curated document summaries of legal acts from the European Union law platform (EUR-Lex). Documents and their respective summaries exist as cross-lingual paragraph-aligned data in several of the 24 official European languages, enabling access to various cross-lingual and lower-resourced summarization setups. We obtain up to 1,500 document/summary pairs per language, including a subset of 375 cross-lingually aligned legal acts with texts available in *all* 24 languages. \nIn this work, the data acquisition process is detailed and key characteristics of the resource are compared to existing summarization resources. In particular, we illustrate challenging sub-problems and open questions on the dataset that could help the facilitation of future research in the direction of domain-specific cross-lingual summarization.\nLimited by the extreme length and language diversity of samples, we further conduct experiments with suitable extractive monolingual and cross-lingual baselines for future work. \nCode for the extraction as well as access to our data and baselines is available online at: [https://github.com/achouhan93/eur-lex-sum](https://github.com/achouhan93/eur-lex-sum).", "track": "Resources and Evaluation", "label": 1}, {"loc": [8.030890464782715, 5.760574817657471], "id": 2541, "title": "Differentiable Data Augmentation for Contrastive Sentence Representation Learning", "authors": "Tianduo Wang and Wei Lu", "abstract": "Fine-tuning a pre-trained language model via the contrastive learning framework with a large amount of unlabeled sentences or labeled sentence pairs is a common way to obtain high-quality sentence representations. Although the contrastive learning framework has shown its superiority on sentence representation learning over previous methods, the potential of such a framework is under-explored so far due to the simple method it used to construct positive pairs. Motivated by this, we propose a method that makes hard positives from the original training examples. A pivotal ingredient of our approach is the use of prefix that attached to a pre-trained language model, which allows for differentiable data augmentation during contrastive learning. Our method can be summarized in two steps: supervised prefix-tuning followed by joint contrastive fine-tuning with unlabeled or labeled examples. Our experiments confirm the effectiveness of our data augmentation approach. The proposed method yields significant improvements over existing methods under both semi-supervised and supervised settings. Our experiments under a low labeled data setting also show that our method is more label-efficient than the state-of-the-art contrastive learning methods.", "track": "Ethic Concerns:Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [5.9171576499938965, 9.150935173034668], "id": 2545, "title": "Text Style Transferring via Adversarial Masking and Styled Filling", "authors": "Jiarui Wang, Richong Zhang, Junfan Chen, Jaein Kim and Yongyi Mao", "abstract": "Text style transfer is an important task in natural language processing with broad applications. Existing models following the masking and filling scheme suffer two challenges: the word masking procedure may mistakenly remove unexpected words and the selected words in the word filling procedure may lack diversity and semantic consistency. To tackle both challenges, in this study, we propose a style transfer model, with an adversarial masking approach and a styled filling technique (AMSF). Specifically, AMSF first trains a mask predictor by adversarial training without manual configuration. Then two additional losses, i.e. an entropy maximization loss and a consistency regularization loss, are introduced in training the word filling module to guarantee the diversity and semantic consistency of the transferred texts. Experimental results and analysis on two benchmark text style transfer data sets demonstrate the effectiveness of the proposed approaches.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [7.644128322601318, 3.624854803085327], "id": 2550, "title": "Character-level White-Box Adversarial Attacks against Transformers via Attachable Subwords Substitution", "authors": "Aiwei Liu, Honghai Yu, Xuming Hu, Shu'ang Li, Li Lin, Fukun Ma, Yawen Yang and Lijie Wen", "abstract": "We propose the first character-level white-box adversarial attack method against transformer models. The intuition of our method comes from the observation that words are split into subtokens before being fed into the transformer models and the substitution between two close subtokens has a similar effect with the character modification. Our method mainly contains three steps. First, a gradient-based method is adopted to find the most vulnerable words in the sentence. Then we split the selected words into subtokens to replace the origin tokenization result from the transformer tokenizer. Finally, we utilize an adversarial loss to guide the substitution of attachable subtokens in which the Gumbel-softmax trick is introduced to ensure gradient propagation.\nMeanwhile, we introduce the visual and length constraint in the optimization process to achieve minimum character modifications.\nExtensive experiments on both sentence-level and token-level tasks demonstrate that our method could outperform the previous attack methods in terms of success rate and edit distance. Furthermore, human evaluation verifies our adversarial examples could preserve their origin labels.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [0.6112386584281921, 7.622259140014648], "id": 2556, "title": "Query-based Instance Discrimination Network for Relational Triple Extraction", "authors": "Zeqi Tan, Yongliang Shen, Xuming Hu, Wenqi Zhang, Xiaoxia Cheng, Weiming Lu and Yueting Zhuang", "abstract": "Joint entity and relation extraction has been a core task in the field of information extraction. Recent approaches usually consider the extraction of relational triples from a stereoscopic perspective, either learning a relation-specific tagger or separate classifiers for each relation type. However, they still suffer from error propagation, relation redundancy and lack of high-level connections between triples. To address these issues, we propose a novel query-based approach to construct instance-level representations for relational triples. By metric-based comparison between query embeddings and token embeddings, we can extract all types of triples in one step, thus eliminating the error propagation problem. In addition, we learn the instance-level representation of relational triples via contrastive learning. In this way, relational triples can not only enclose rich class-level semantics but also access to high-order global connections. Experimental results show that our proposed method achieves the state of the art on five widely used benchmarks.", "track": "Information Extraction", "label": 5}, {"loc": [0.6411269903182983, 7.273882865905762], "id": 2558, "title": "Learning Inter-Entity-Interaction for Few-Shot Knowledge Graph Completion", "authors": "Yuling Li, Kui Yu, Xiaoling Huang and Yuhong Zhang", "abstract": "Few-shot knowledge graph completion (FKGC) aims to infer unknown fact triples of a relation using its few-shot reference entity pairs. Recent FKGC studies focus on learning semantic representations of entity pairs by separately encoding the neighborhoods of head and tail entities. Such practice, however, ignores the inter-entity interaction, resulting in low-discrimination representations for entity pairs, especially when these entity pairs are associated with 1-to-N, N-to-1, and N-to-N relations. To address this issue, this paper proposes a novel FKGC model, named Cross-Interaction Attention Network (CIAN) to investigate the inter-entity interaction between head and tail entities. Specifically, we first explore the interactions within entities by computing the attention between the task relation and each entity neighbor, and then model the interactions between head and tail entities by letting an entity to attend to the neighborhood of its paired entity. In this way, CIAN can figure out the relevant semantics between head and tail entities, thereby generating more discriminative representations for entity pairs. Extensive experiments on two public datasets show that CIAN outperforms several state-of-the-art methods. The source code is available at \\url{https://github.com/cjlyl/FKGC-CIAN}.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.538283824920654, 5.569461822509766], "id": 2564, "title": "Empowering the Fact-checkers! Automatic Identification of Claim Spans on Twitter", "authors": "Megha Sundriyal, Atharva Kulkarni, Vaibhav Pulastya, Md. Shad Akhtar and Tanmoy Chakraborty", "abstract": "The widespread diffusion of medical and political claims in the wake of COVID-19 has led to a voluminous rise in misinformation and fake news. The current vogue is to employ manual fact-checkers to efficiently classify and verify such data to combat this avalanche of claim-ridden misinformation. However, the rate of information dissemination is such that it vastly outpaces the fact-checkers' strength. Therefore, to aid manual fact-checkers in eliminating the superfluous content, it becomes imperative to automatically identify and extract the snippets of claim-worthy (mis)information present in a post. In this work, we introduce the novel task of Claim Span Identification (CSI). We propose CURT, a large-scale Twitter corpus with token-level claim spans on more than 7.5k tweets. Furthermore, along with the standard token classification baselines, we benchmark our dataset with DABERTa, an adapter-based variation of RoBERTa. The experimental results attest that DABERTa outperforms the baseline systems across several evaluation metrics, improving by about 1.5 points. We also report detailed error analysis to validate the model's performance along with the ablation studies. Lastly, we release our comprehensive span annotation guidelines for public use.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [3.6693637371063232, 8.173981666564941], "id": 2566, "title": "ClidSum: A Benchmark Dataset for Cross-Lingual Dialogue Summarization", "authors": "Jiaan Wang, Fandong Meng, Ziyao Lu, Duo Zheng, Zhixu Li, Jianfeng Qu and Jie Zhou", "abstract": "We present ClidSum, a benchmark dataset towards building cross-lingual summarization systems on dialogue documents. It consists of 67k+ dialogue documents and 112k+ annotated summaries in different target languages. Based on the proposed ClidSum, we introduce two benchmark settings for supervised and semi-supervised scenarios, respectively. We then build various baseline systems in different paradigms (pipeline and end-to-end) and conduct extensive experiments on ClidSum to provide deeper analyses. Furthermore, we propose mDialBART which extends mBART via further pre-training, where the multiple objectives help the pre-trained model capture the structural characteristics as well as key content in dialogues and the transformation from source to the target language. Experimental results show the superiority of mDialBART, as an end-to-end model, outperforms strong pipeline models on ClidSum. Finally, we discuss specific challenges that current approaches faced with this task and give multiple promising directions for future research. We have released the dataset and code at https://github.com/krystalan/ClidSum.", "track": "Summarization", "label": 14}, {"loc": [9.026941299438477, 6.127098560333252], "id": 2578, "title": "Spectral Probing", "authors": "Max M\u00fcller-Eberstein, Rob van der Goot and Barbara Plank", "abstract": "Linguistic information is encoded at varying timescales (subwords, phrases, etc.) and communicative levels, such as syntax and semantics. Contextualized embeddings have analogously been found to capture these phenomena at distinctive layers and frequencies. Leveraging these findings, we develop a fully learnable frequency filter to identify spectral profiles for any given task. It enables vastly more granular analyses than prior handcrafted filters, and improves on efficiency. After demonstrating the informativeness of spectral probing over manual filters in a monolingual setting, we investigate its multilingual characteristics across seven diverse NLP tasks in six languages. Our analyses identify distinctive spectral profiles which quantify cross-task similarity in a linguistically intuitive manner, while remaining consistent across languages\u2014highlighting their potential as robust, lightweight task descriptors.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [3.7154057025909424, 5.4278035163879395], "id": 2580, "title": "QASem Parsing: Text-to-text Modeling of QA-based Semantics", "authors": "Ayal Klein, Eran Hirsch, Ron Eliav, Valentina Pyatkin, Avi Caciularu and Ido Dagan", "abstract": "Various works suggest the appeals of incorporating explicit semantic representations when addressing challenging realistic NLP scenarios. Common approaches offer either comprehensive linguistically-based formalisms, like AMR, or alternatively Open-IE, which provides a shallow and partial representation. More recently, an appealing trend introduces semi-structured natural-language structures as an intermediate meaning-capturing representation, often in the form of questions and answers.\n\nIn this work, we further promote this line of research by considering three prior QA-based semantic representations. These cover verbal, nominalized and discourse-based predications, regarded as jointly providing a comprehensive representation of textual information --- termed QASem. To facilitate this perspective, we investigate how to best utilize pre-trained sequence-to-sequence language models, which seem particularly promising for generating representations that consist of natural language expressions (questions and answers). In particular, we examine and analyze input and output linearization strategies, as well as data augmentation and multitask learning for a scarce training data setup. Consequently, we release the first unified QASem parsing tool, easily applicable for downstream tasks that can benefit from an explicit semi-structured account of information units in text.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [5.0954155921936035, 9.629100799560547], "id": 2585, "title": "Keyphrase Generation via Soft and Hard Semantic Corrections", "authors": "Guangzhen Zhao, Guoshun Yin, Peng Yang and Yu Yao", "abstract": "Keyphrase generation aims to generate a set of condensed phrases given a source document. Although maximum likelihood estimation (MLE) based keyphrase generation methods have shown impressive performance, they suffer from the bias on the source-prediction sequence pair and the bias on the prediction-target pair. To tackle the above biases, we propose a novel correction model CorrKG on top of the MLE pipeline, where the biases are corrected via the optimal transport (OT) and a frequency-based filtering-and-sorting (FreqFS) strategy. Specifically, OT is introduced as soft correction to facilitate the alignment of salient information and rectify the semantic bias in the source document and predicted keyphrases pair. An adaptive semantic mass learning scheme is conducted on the vanilla OT to achieve a proper pair-wise optimal transport procedure, which promotes the OT learning brought by rectifying semantic masses dynamically. Besides, the FreqFS strategy is designed as hard correction to reduce the bias of predicted and ground truth keyphrases, and thus to generate accurate and sufficient keyphrases. Extensive experiments over multiple benchmark datasets show that our model achieves superior keyphrase generation as compared with the state-of-the-arts.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.596837520599365, 11.664127349853516], "id": 2589, "title": "Modal-specific Pseudo Query Generation for Video Corpus Moment Retrieval", "authors": "MinJoon Jung, SeongHo Choi, JooChan Kim, Jin-Hwa Kim and Byoung-Tak Zhang", "abstract": "Video corpus moment retrieval (VCMR) is the task to retrieve the most relevant video moment from a large video corpus using a natural language query.\nFor narrative videos, e.g., drama or movies, the holistic understanding of temporal dynamics and multimodal reasoning are crucial.\nPrevious works have shown promising results; however, they relied on the expensive query annotations for the VCMR, i.e., the corresponding moment intervals.\nTo overcome this problem, we propose a self-supervised learning framework: Modal-specific Pseudo Query Generation Network (MPGN).\nFirst, MPGN selects candidate temporal moments via subtitle-based moment sampling.\nThen, it generates pseudo queries exploiting both visual\nand textual information from the selected temporal moments.\nThrough the multimodal information in the pseudo queries, we show that MPGN successfully learns to localize the video corpus moment without any explicit annotation.\nWe validate the effectiveness of MPGN on TVR dataset, showing the competitive results compared with both supervised models and unsupervised setting models.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [3.35899019241333, 4.05472469329834], "id": 2606, "title": "DuQM: A Chinese Dataset of Linguistically Perturbed Natural Questions for Evaluating the Robustness of Question Matching Models", "authors": "Hongyu Zhu, Yan Chen, Jing Yan, Jing Liu, Yu Hong, Ying Chen, Hua Wu and Haifeng Wang", "abstract": "In this paper, we focus on the robustness evaluation of Chinese Question Matching (QM) models. Most of the previous work on analyzing robustness issues focus on just one or a few types of artificial adversarial examples. Instead, we argue that a comprehensive evaluation should be conducted on natural texts, which takes into account the fine-grained linguistic capabilities of QM models. For this purpose, we create a Chinese dataset namely DuQM which contains natural questions with linguistic perturbations to evaluate the robustness of QM models. DuQM contains 3 categories and 13 subcategories with 32 linguistic perturbations. The extensive experiments demonstrate that DuQM has a better ability to distinguish different models. Importantly, the detailed breakdown of evaluation by the linguistic phenomena in DuQM helps us easily diagnose the strength and weakness of different models. Additionally, our experiment results show that the effect of artificial adversarial examples does not work on natural texts. Our baseline codes and a leaderboard are now publicly available.", "track": "Question Answering", "label": 11}, {"loc": [10.882485389709473, 6.780794620513916], "id": 2609, "title": "DivEMT: Neural Machine Translation Post-Editing Effort Across Typologically Diverse Languages", "authors": "Gabriele Sarti, Arianna Bisazza, Ana Guerberof and Antonio Toral", "abstract": "We introduce DivEMT, the first publicly available post-editing study of Neural Machine Translation (NMT) over a typologically diverse set of target languages. Using a strictly controlled setup, 18 professional translators were instructed to translate or post-edit the same set of English documents into Arabic, Dutch, Italian, Turkish, Ukrainian, and Vietnamese. During the process, their edits, keystrokes, editing times and pauses were recorded, enabling an in-depth, cross-lingual evaluation of NMT quality and post-editing effectiveness. Using this new dataset, we assess the impact of two state-of-the-art NMT systems, Google Translate and the multilingual mBART-50 model, on translation productivity. We find that post-editing is consistently faster than translation from scratch. However, the magnitude of productivity gains varies widely across systems and languages, highlighting major disparities in post-editing effectiveness for languages at different degrees of typological relatedness to English, even when controlling for system architecture and training data size. We publicly release the complete dataset including all collected behavioral data, to foster new research on the translation capabilities of NMT systems for typologically diverse languages.", "track": "Resources and Evaluation", "label": 1}, {"loc": [6.295383453369141, 5.520113945007324], "id": 2616, "title": "Bridging Fairness and Environmental Sustainability in Natural Language Processing", "authors": "Marius Hessenthaler, Emma Strubell, Dirk Hovy and Anne Lauscher", "abstract": "Fairness and environmental impact are important research directions for the sustainable development of artificial intelligence. However, while each topic is an active research area in natural language processing (NLP), there is a surprising lack of research on the interplay between the two fields. \nThis lacuna is highly problematic, since there is increasing evidence that an exclusive focus on fairness can actually hinder environmental sustainability, and vice versa. \nIn this work, we shed light on this crucial intersection in NLP by (1) investigating the efficiency of current fairness approaches through surveying example methods for reducing unfair stereotypical bias from the literature, and (2) evaluating a common technique to reduce energy consumption (and thus environmental impact) of English NLP models, knowledge distillation (KD), for its impact on fairness. \nIn this case study, we evaluate the effect of important KD factors, including layer and dimensionality reduction, with respect to: (a) performance on the distillation task (natural language inference and semantic similarity prediction), and (b) multiple measures and dimensions of stereotypical bias (e.g., gender bias measured via the Word Embedding Association Test). \nOur results lead us to clarify current assumptions regarding the effect of KD on unfair bias: contrary to other findings, we show that KD can actually decrease model fairness.", "track": "Theme Track", "label": 18}, {"loc": [6.517040252685547, 12.039569854736328], "id": 2620, "title": "UniMSE: Towards Unified Multimodal Sentiment Analysis and Emotion Recognition", "authors": "Guimin Hu, Ting-En Lin, Yi Zhao, Guangming Lu, Yuchuan Wu and Yongbin Li", "abstract": "Multimodal sentiment analysis (MSA) and emotion recognition in conversation (ERC) are key research topics for computers to understand human behaviors. From a psychological perspective, emotions are the expression of affect or feelings during a short period, while sentiments are formed and held for a longer period. However, most existing works study sentiment and emotion separately and do not fully exploit the complementary knowledge behind the two. In this paper, we propose a multimodal sentiment knowledge-sharing framework (UniMSE) that unifies MSA and ERC tasks from features, labels, and models. We perform modality fusion at the syntactic and semantic levels and introduce contrastive learning between modalities and samples to better capture the difference and consistency between sentiments and emotions. Experiments on four public benchmark datasets, MOSI, MOSEI, MELD, and IEMOCAP, demonstrate the effectiveness of the proposed method and achieve consistent improvements compared with state-of-the-art methods.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [6.962857723236084, 6.344462871551514], "id": 2626, "title": "Is the Brain Mechanism for Hierarchical Structure Building Universal Across Languages? An fMRI Study of Chinese and English", "authors": "Xiaohan Zhang, Shaonan Wang, Nan Lin and Chengqing Zong", "abstract": "Evidence from psycholinguistic studies suggests that the human brain builds a hierarchical syntactic structure during language comprehension. However, it is still unknown whether the neural basis of such structures is universal across languages. In this paper, we first analyze the differences in language structure between two diverse languages: Chinese and English. \nBy computing the working memory requirements when applying parsing strategies to different language structures, we find that top-down parsing generates less memory load for the right-branching English and bottom-up parsing is less memory-demanding for Chinese.\nThen we use functional magnetic resonance imaging (fMRI) to investigate whether the brain has different syntactic adaptation strategies in processing Chinese and English. Specifically, for both Chinese and English, we extract predictors from the implementations of different parsing strategies, i.e., bottom-up and top-down. Then, these predictors are separately associated with fMRI signals. \nResults show that for Chinese and English, the brain utilizes bottom-up and top-down parsing strategies separately. These results reveal that the brain adopts parsing strategies with less memory processing load according to different language structures.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [8.561875343322754, 7.85040807723999], "id": 2630, "title": "HashFormers: Towards Vocabulary-independent Pre-trained Transformers", "authors": "Huiyin Xue and Nikolaos Aletras", "abstract": "Transformer-based pre-trained language models are vocabulary-dependent, mapping by default each token to its corresponding embedding. This one-to-one mapping results into embedding matrices that occupy a lot of memory (i.e. millions of parameters) and grow linearly with the size of the vocabulary. Previous work on on-device transformers dynamically generate token embeddings on-the-fly without embedding matrices using locality-sensitive hashing over morphological information. These embeddings are subsequently fed into transformer layers for text classification. However, these methods are not pre-trained. Inspired by this line of work, we propose HashFormers, a new family of vocabulary-independent pre-trained transformers that support an unlimited vocabulary (i.e. all possible tokens in a corpus) given a substantially smaller fixed-sized embedding matrix. We achieve this by first introducing computationally cheap hashing functions that bucket together individual tokens to embeddings. We also propose three variants that do not require an embedding matrix at all, further reducing the memory requirements. We empirically demonstrate that HashFormers are more memory efficient compared to standard pre-trained transformers while achieving comparable predictive performance when fine-tuned on multiple text classification tasks. For example, our most efficient HashFormer variant has a negligible performance degradation (0.4% on GLUE) using only 99.1K parameters for representing the embeddings compared to 12.3-38M parameters of state-of-the-art models.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [0.7850092649459839, 7.930887222290039], "id": 2634, "title": "MatchPrompt: Prompt-based Open Relation Extraction with Semantic Consistency Guided Clustering", "authors": "Jiaxin Wang, Lingling Zhang, Jun Liu, Liang Xi, Yujie Zhong and Yaqiang Wu", "abstract": "Relation clustering is a general approach for open relation extraction (OpenRE). Current methods have two major problems. One is that their good performance relies on large amounts of labeled and pre-defined relational instances for pre-training, which are costly to acquire in reality. The other is that they only focus on learning a high-dimensional metric space to measure the similarity of novel relations and ignore the specific relational representations of clusters. In this work, we propose a new prompt-based framework named MatchPrompt, which can realize OpenRE with efficient knowledge transfer from only a few pre-defined relational instances as well as mine the specific meanings for cluster interpretability. To our best knowledge, we are the first to introduce a prompt-based framework for unlabeled clustering. Experimental results on different datasets show that MatchPrompt achieves the new SOTA results for OpenRE.", "track": "Information Extraction", "label": 5}, {"loc": [1.045127511024475, 10.527294158935547], "id": 2638, "title": "Improving Aspect Sentiment Quad Prediction via Template-Order Data Augmentation", "authors": "Mengting Hu, Yike Wu, Hang Gao, Yinhao Bai and Shiwan Zhao", "abstract": "Recently, aspect sentiment quad prediction (ASQP) has become a popular task in the field of aspect-level sentiment analysis. Previous work utilizes a predefined template to paraphrase the original sentence into a structure target sequence, which can be easily decoded as quadruplets of the form (aspect category, aspect term, opinion term, sentiment polarity). The template involves the four elements in a fixed order. However, we observe that this solution contradicts with the order-free property of the ASQP task, since there is no need to fix the template order as long as the quadruplet is extracted correctly. Inspired by the observation, we study the effects of template orders and find that some orders help the generative model achieve better performance. It is hypothesized that different orders provide various views of the quadruplet. Therefore, we propose a simple but effective method to identify the most proper orders, and further combine multiple proper templates as data augmentation to improve the ASQP task. Specifically, we use the pre-trained language model to select the orders with minimal entropy. By fine-tuning the pre-trained language model with these template orders, our approach improves the performance of quad prediction, and outperforms state-of-the-art methods significantly in low-resource settings.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [6.105708599090576, 5.932534217834473], "id": 2639, "title": "SocioProbe: What, When, and Where Language Models Learn about Sociodemographics", "authors": "Anne Lauscher, Federico Bianchi, Samuel R. Bowman and Dirk Hovy", "abstract": "Pre-trained language models (PLMs) have outperformed other NLP models on a wide range of tasks. \nOpting for a more thorough understanding of their capabilities and inner workings, researchers have established the extend to which they capture lower-level knowledge like grammaticality, and mid-level semantic knowledge like factual understanding. However, there is still little understanding of their knowledge of higher-level aspects of language. In particular, despite the importance of sociodemographic aspects in shaping our language, the questions of whether, where, and how PLMs encode these aspects, e.g., gender or age, is still unexplored. \nWe address this research gap by probing the sociodemographic knowledge of different single-GPU PLMs on multiple English data sets via traditional classifier probing and information-theoretic minimum description length probing. Our results show that PLMs do encode these sociodemographics, and that this knowledge is sometimes spread across the layers of some of the tested PLMs. We further conduct a multilingual analysis and investigate the effect of supplementary training to further explore to what extent, where, and with what amount of pre-training data the knowledge is encoded. \nOur overall results indicate that sociodemographic knowledge is still a major challenge for NLP. PLMs require large amounts of pre-training data to acquire the knowledge and models that excel in general language understanding do not seem to own more knowledge about these aspects.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [10.391593933105469, 7.04047966003418], "id": 2642, "title": "When does Parameter-Efficient Transfer Learning Work for Machine Translation?", "authors": "Ahmet \u00dcst\u00fcn and Asa Cooper Stickland", "abstract": "Parameter-efficient fine-tuning methods (PEFTs) offer the promise of adapting large pre-trained models while only tuning a small number of parameters. They have been shown to be competitive with full model fine-tuning for many downstream tasks. However, prior work indicates that PEFTs may not work as well for machine translation (MT), and there is no comprehensive study showing when PEFTs work for MT. We conduct a comprehensive empirical study of PEFTs for MT, considering (1) various parameter budgets, (2) a diverse set of language-pairs, and (3) different pre-trained models. \nWe find that 'adapters', in which small feed-forward networks are added after every layer, are indeed on par with full model fine-tuning when the parameter budget corresponds to 10% of total model parameters. Nevertheless, as the number of tuned parameters decreases, the performance of PEFTs decreases. The magnitude of this decrease depends on the language pair, with PEFTs particularly struggling for distantly related language-pairs. We find that using PEFTs with a larger pre-trained model outperforms full fine-tuning with a smaller model, and for smaller training data sizes, PEFTs outperform full fine-tuning for the same pre-trained model.", "track": "Machine Translation", "label": 10}, {"loc": [9.229597091674805, 6.99232816696167], "id": 2650, "title": "Hyper-X: A Unified Hypernetwork for Multi-Task Multilingual Transfer", "authors": "Ahmet \u00dcst\u00fcn, Arianna Bisazza, Gosse Bouma, Gertjan van Noord and Sebastian Ruder", "abstract": "Massively multilingual models are promising for transfer learning across tasks and languages. However, existing methods are unable to fully leverage training data when it is available in different task-language combinations. To exploit such heterogeneous supervision, we propose Hyper-X, a single hypernetwork that unifies multi-task and multilingual learning with efficient adaptation. It generates weights for adapter modules conditioned on both tasks and language embeddings. By learning to combine task and language-specific knowledge, our model enables zero-shot transfer for unseen languages and task-language combinations. Our experiments on a diverse set of languages demonstrate that Hyper-X achieves the best or competitive gain when a mixture of multiple resources is available, while on par with strong baseline in the standard scenario. Hyper-X is also considerably more efficient in terms of parameters and resources compared to methods that train separate adapters. Finally, Hyper-X consistently produces strong results in few-shot scenarios for new languages, showing the versatility of our approach beyond zero-shot transfer.", "track": "Multilinguality", "label": 13}, {"loc": [3.39912748336792, 4.042390823364258], "id": 2652, "title": "Towards Robust Numerical Question Answering: Diagnosing Numerical Capabilities of NLP Systems", "authors": "Jialiang Xu, Mengyu Zhou, Xinyi He, Shi Han and Dongmei Zhang", "abstract": "Numerical Question Answering is the task of answering questions that require numerical capabilities. Previous works introduce general adversarial attacks to Numerical Question Answering, while not systematically exploring numerical capabilities specific to the topic. In this paper, we propose to conduct numerical capability diagnosis on a series of Numerical Question Answering systems and datasets. A series of numerical capabilities are highlighted, and corresponding dataset perturbations are designed. Empirical results indicate that existing systems are severely challenged by these perturbations. E.g., Graph2Tree experienced a 53.83% absolute accuracy drop against the \"Extra\u201d perturbation on ASDiv-a, and BART experienced 13.80% accuracy drop against the \"Language\u201d perturbation on the numerical subset of DROP. As a counteracting approach, we also investigate the effectiveness of applying perturbations as data augmentation to relieve systems' lack of robust numerical capabilities. With experiment analysis and empirical studies, it is demonstrated that Numerical Question Answering with robust numerical capabilities is still to a large extent an open question. We discuss future directions of Numerical Question Answering and summarize guidelines on future dataset collection and system design.", "track": "Theme Track", "label": 18}, {"loc": [6.915184020996094, 9.573827743530273], "id": 2659, "title": "Enhancing Joint Multiple Intent Detection and Slot Filling with Global Intent-Slot Co-occurrence", "authors": "Mengxiao Song, Bowen Yu, Li Quangang, Wang Yubin, Tingwen Liu and Hongbo Xu", "abstract": "Multi-intent detection and slot filling joint model attracts more and more attention since it can handle multi-intent utterances, which is closer to complex real-world scenarios. Most existing joint models rely entirely on the training procedure to obtain the implicit correlation between intents and slots. However, they ignore the fact that leveraging the rich global knowledge in the corpus can determine the intuitive and explicit correlation between intents and slots. \nIn this paper, we aim to make full use of the statistical co-occurrence frequency between intents and slots as prior knowledge to enhance joint multiple intent detection and slot filling. To be specific, an intent-slot co-occurrence graph is constructed based on the entire training corpus to globally discover correlation between intents and slots. Based on the global intent-slot co-occurrence, we propose a novel graph neural network to model the interaction between the two subtasks. \nExperimental results on two public multi-intent datasets demonstrate that our approach outperforms the state-of-the-art models.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [4.176275253295898, 8.134502410888672], "id": 2660, "title": "Towards Pragmatic Production Strategies for Natural Language Generation Tasks", "authors": "Mario Giulianelli", "abstract": "This position paper proposes a conceptual framework for the design of Natural Language Generation (NLG) systems that follow efficient and effective production strategies in order to achieve complex communicative goals. In this general framework, efficiency is characterised as the parsimonious regulation of production and comprehension costs while effectiveness is measured with respect to task-oriented and contextually grounded communicative goals. \nWe provide concrete suggestions for the estimation of goals, costs, and utility via modern statistical methods, demonstrating applications of our framework to the classic pragmatic task of visually grounded referential games and to abstractive text summarisation, two popular generation tasks with real-world applications. In sum, we advocate for the development of NLG systems that learn to make pragmatic production decisions from experience, by reasoning about goals, costs, and utility in a human-like way.", "track": "Theme Track", "label": 18}, {"loc": [5.7520575523376465, 11.786705017089844], "id": 2670, "title": "LiteVL: Efficient Video-Language Learning with Enhanced Spatial-Temporal Modeling", "authors": "Dongsheng Chen, Chaofan Tao, Lu Hou, Lifeng Shang, Xin Jiang and Qun Liu", "abstract": "Recent large-scale video-language pre-trained models have shown appealing performance on various downstream tasks. However, the pre-training process is computationally expensive due to the requirement of millions of video-text pairs and the redundant data structure of each video. To mitigate these problems, we propose LiteVL, which adapts a pre-trained image-language model BLIP into a video-text model directly on downstream tasks, without heavy pre-training. To enhance the temporal modeling lacking in the image-language model, we propose to add temporal attention modules in the image encoder of BLIP with dynamic temporal scaling. Besides the model-wise adaptation, we also propose a non-parametric pooling mechanism to adaptively reweight the fine-grained video embedding conditioned on the text. Experimental results on text-video retrieval and video question answering show that the proposed LiteVL even outperforms previous video-language pre-trained models by a clear margin, though without any video-language pre-training.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.195528984069824, 12.423458099365234], "id": 2676, "title": "Communication breakdown: On the low mutual intelligibility between human and neural captioning", "authors": "Roberto Dess\u00ec, Eleonora Gualdoni, Francesca Franzon, Gemma Boleda and Marco Baroni", "abstract": "We compare the 0-shot performance of a neural caption-based image retriever when given as input either human-produced captions or captions generated by a neural captioner. We conduct this comparison on the recently introduced ImageCoDe data-set (Krojer et al. 2022), which contains hard distractors nearly identical to the images to be retrieved. We find that the neural retriever has much higher performance when fed neural rather than human captions, despite the fact that the former, unlike the latter, were generated without awareness of the distractors that make the task hard. Even more remarkably, when the same neural captions are given to human subjects, their retrieval performance is almost at chance level. Our results thus add to the growing body of evidence that, even when the ``language'' of neural models resembles English, this superficial resemblance might be deeply misleading.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [10.670960426330566, 6.964401721954346], "id": 2688, "title": "Normalizing Mutual Information for Robust Adaptive Training for Translation", "authors": "Youngwon Lee, Changmin Lee, Hojin Lee and Seung-won Hwang", "abstract": "Despite the success of neural machine translation models, tensions between fluency of optimizing target language modeling and source-faithfulness remain as challenges. Previously, Conditional Bilingual Mutual Information (CBMI), a scoring metric for the importance of target sentences and tokens, was proposed to encourage fluent and faithful translations. The score is obtained by combining the probability from the translation model and the target language model, which is then used to assign different weights to losses from sentences and tokens. Meanwhile, we argue this metric is not properly normalized, for which we propose Normalized Pointwise Mutual Information (NPMI). NPMI utilizes an additional language model on source language to approximate the joint likelihood of source-target pair and the likelihood of the source, which is then used for normalizing the score. We showed that NPMI better captures the dependence between source-target and that NPMI-based token-level adaptive training brings improvements over baselines with empirical results from En-De, De-En, and En-Ro translation tasks.", "track": "Machine Translation", "label": 10}, {"loc": [10.632492065429688, 6.878334999084473], "id": 2690, "title": "Bilingual Synchronization: Restoring Translational Relationships with Editing Operations", "authors": "Jitao Xu, Josep Crego and Fran\u00e7ois Yvon", "abstract": "Machine Translation (MT) is usually viewed as a one-shot process that generates the target language equivalent of some source text from scratch. We consider here a more general setting which assumes an initial target sequence, that must be transformed into a valid translation of the source, thereby restoring parallelism between source and target. For this bilingual synchronization task, we consider several architectures (both autoregressive and non-autoregressive) and training regimes, and experiment with multiple practical settings such as simulated interactive MT, translating with Translation Memory (TM) and TM cleaning. Our results suggest that one single generic edit-based system, once fine-tuned, can compare with, or even outperform, dedicated systems specifically trained for these tasks.", "track": "Machine Translation", "label": 10}, {"loc": [7.550704479217529, 12.359099388122559], "id": 2696, "title": "Human-Machine Collaboration Approaches to Build a Dialogue Dataset for Hate Speech Countering", "authors": "Helena Bonaldi, Sara Dellantonio, Serra Sinem Tekiro\u011flu and Marco Guerini", "abstract": "Fighting online hate speech is a challenge that is usually addressed using Natural Language Processing via automatic detection and removal of hate content. Besides this approach, counter narratives have emerged as an effective tool employed by NGOs to respond to online hate on social media platforms. For this reason, Natural Language Generation is currently being studied as a way to automatize counter narrative writing. However, the existing resources necessary to train NLG models are limited to 2-turn interactions (a hate speech and a counter narrative as response), while in real life, interactions can consist of multiple turns. In this paper, we present a hybrid approach for dialogical data collection, which combines the intervention of human expert annotators over machine generated dialogues obtained using 19 different configurations. The result of this work is DIALOCONAN, the first dataset comprising over 3000 fictitious multi-turn dialogues between a hater and an NGO operator, covering 6 targets of hate.", "track": "Resources and Evaluation", "label": 1}, {"loc": [10.46934700012207, 7.771413803100586], "id": 2710, "title": "JANUS: Joint Autoregressive and Non-autoregressive Training with Auxiliary Loss for Sequence Generation", "authors": "Xiaobo Liang, Lijun Wu, Juntao Li and Min Zhang", "abstract": "Transformer-based autoregressive and non-autoregressive models have played an essential role in sequence generation tasks. The autoregressive model can obtain excellent performance, while the non-autoregressive model brings fast decoding speed for inference. In this paper, we propose \\textbf{JANUS}, a \\textbf{J}oint \\textbf{A}utoregressive and \\textbf{N}on-autoregressive training method using a\\textbf{U}xiliary los\\textbf{S} to enhance the model performance in both AR and NAR manner simultaneously and effectively alleviate the problem of distribution discrepancy.\nFurther, we pre-train BART with JANUS on a large corpus with minimal cost (16 GPU days) and make the BART-JANUS capable of non-autoregressive generation, demonstrating that our approach can transfer the AR knowledge to NAR. Empirically, we show our approach and BART-JANUS can achieve significant improvement on multiple generation tasks, including machine translation and GLGE benchmarks. Our code is available at Github\\footnote{\\url{https://github.com/dropreg/JANUS}}.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.427000522613525, 12.192925453186035], "id": 2725, "title": "Entity-Focused Dense Passage Retrieval for Outside-Knowledge Visual Question Answering", "authors": "Jialin Wu and Raymond Mooney", "abstract": "Most Outside-Knowledge Visual Question Answering (OK-VQA) systems employ a two-stage framework that first retrieves external knowledge given the visual question and then predicts the answer based on the retrieved content. However, the retrieved knowledge is often inadequate. Retrievals are frequently too general and fail to cover specific knowledge needed to answer the question. Also, the naturally available supervision (whether the passage contains the correct answer) is weak and does not guarantee question relevancy. To address these issues, we propose an Entity-Focused Retrieval (EnFoRe) model that provides stronger supervision during training and recognizes question-relevant entities to help retrieve more specific knowledge. Experiments show that our EnFoRe model achieves superior retrieval performance on OK-VQA, the currently largest outside-knowledge VQA dataset. We also combine the retrieved knowledge with state-of-the-art VQA models, and achieve a new state-of-the-art performance on OK-VQA.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [9.146728515625, 6.326391696929932], "id": 2728, "title": "Cross-Linguistic Syntactic Difference in Multilingual BERT: How Good is It and How Does It Affect Transfer?", "authors": "Ningyu Xu, Tao Gui, Ruotian Ma, Qi Zhang, Jingting Ye, Menghan Zhang and Xuanjing Huang", "abstract": "Multilingual BERT (mBERT) has demonstrated considerable cross-lingual syntactic ability, whereby it enables effective zero-shot cross-lingual transfer of syntactic knowledge. The transfer is more successful between some languages, but it is not well understood what leads to this variation and whether it fairly reflects difference between languages. In this work, we investigate the distributions of grammatical relations induced from mBERT in the context of 24 typologically different languages. We demonstrate that the distance between the distributions of different languages is highly consistent with the syntactic difference in terms of linguistic formalisms. Such difference learnt via self-supervision plays a crucial role in the zero-shot transfer performance and can be predicted by variation in morphosyntactic properties between languages. These results suggest that mBERT properly encodes languages in a way consistent with linguistic diversity and provide insights into the mechanism of cross-lingual transfer.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [7.562887668609619, 12.31489372253418], "id": 2734, "title": "\"It's Not Just Hate\": A Multi-Dimensional Perspective on Detecting Harmful Speech Online", "authors": "Federico Bianchi, Stefanie HIlls, Patricia Rossini, Dirk Hovy, Rebekah Tromble and Nava Tintarev", "abstract": "Well-annotated data is a prerequisite for good Natural Language Processing models. Too often, though, annotation decisions are governed by optimizing time or annotator agreement. We make a case for nuanced efforts in an interdisciplinary setting for annotating offensive online speech. Detecting offensive content is rapidly becoming one of the most important real-world NLP tasks. However, most datasets use a single binary label, e.g., for hate or incivility, even though each concept is multi-faceted. This modeling choice severely limits nuanced insights, but also performance.\nWe show that a more fine-grained multi-label approach to predicting incivility and hateful or intolerant content addresses both conceptual and performance issues.\nWe release a novel dataset of over 40,000 tweets about immigration from the US and UK, annotated with six labels for different aspects of incivility and intolerance.\nOur dataset not only allows for a more nuanced understanding of harmful speech online, models trained on it also outperform or match performance on benchmark datasets", "track": "NLP Applications", "label": 0}, {"loc": [4.907902240753174, 9.007170677185059], "id": 2758, "title": "Long Text Generation with Topic-aware Discrete Latent Variable Model", "authors": "Erguang Yang, Mingtong Liu, Deyi Xiong, YUJIE ZHANG, Yufeng Chen and Jinan Xu", "abstract": "Generating coherent long texts is an important yet challenging task, particularly for\nthe open-ended generation. Prior work based on discrete latent codes focuses on the modeling of discourse relation, resulting in discrete codes only learning shallow semantics (Ji and Huang, 2021). \nA natural text always revolves around several related topics and the transition across them is natural and smooth.\nIn this work, we investigate whether discrete latent codes can learn information of topics. To this end, we build a topic-aware latent code-guided text generation model. To encourage discrete codes to model information about topics, we propose a span-level bag-of-words training objective for the model. Automatic and manual evaluation experiments show that our method can generate more topic-relevant and coherent texts.", "track": "Natural Language Generation", "label": 6}, {"loc": [1.796485185623169, 5.315113544464111], "id": 2761, "title": "TIARA: Multi-grained Retrieval for Robust Question Answering over Large Knowledge Base", "authors": "Yiheng Shu, Zhiwei Yu, Yuhan Li, B\u00f6rje Karlsson, Tingting Ma, Yuzhong Qu and Chin-Yew Lin", "abstract": "Pre-trained language models (PLMs) have shown their effectiveness in multiple scenarios. However, KBQA remains challenging, especially regarding coverage and generalization settings. This is due to two main factors: i) understanding the semantics of both questions and relevant knowledge from the KB; ii) generating executable logical forms with both semantic and syntactic correctness. In this paper, we present a new KBQA model, TIARA, which addresses those issues by applying multi-grained retrieval to help the PLM focus on the most relevant KB context, viz., entities, exemplary logical forms, and schema items. Moreover, constrained decoding is used to control the output space and reduce generation errors. Experiments over important benchmarks demonstrate the effectiveness of our approach. TIARA outperforms previous SOTA, including those using PLMs or oracle entity annotations, by at least 4.1 and 1.1 F1 points on GrailQA and WebQuestionsSP, respectively. Specifically on GrailQA, TIARA outperforms previous models in all categories, with an improvement of 4.7 F1 points in zero-shot generalization.", "track": "Question Answering", "label": 11}, {"loc": [4.028550624847412, 3.9718105792999268], "id": 2767, "title": "Structure-Unified M-Tree Coding Solver for Math Word Problem", "authors": "bin wang, Jiangzhou Ju, Yang Fan, Xinyu Dai, Shujian Huang and Jiajun CHEN", "abstract": "As one of the challenging NLP tasks, designing math word problem (MWP) solvers has attracted increasing research attention for the past few years. In previous work, models designed by taking into account the properties of the binary tree structure of mathematical expressions at the output side have achieved better performance. However, the expressions corresponding to a MWP are often diverse (e.g., $n_1+n_2 \\times n_3-n_4$, $n_3\\times n_2-n_4+n_1$, etc.), and so are the corresponding binary trees, which creates difficulties in model learning due to the non-deterministic output space. In this paper, we propose the Structure-Unified M-Tree Coding Solver (SUMC-Solver), which applies a tree with any M branches (M-tree) to unify the output structures. To learn the M-tree, we use a mapping to convert the M-tree into the M-tree codes, where codes store the information of the paths from tree root to leaf nodes and the information of leaf nodes themselves, and then devise a Sequence-to-Code (seq2code) model to generate the codes. Experimental results on the widely used MAWPS and Math23K datasets have demonstrated that SUMC-Solver not only outperforms several state-of-the-art models under similar experimental settings but also performs much better under low-resource conditions.", "track": "Question Answering", "label": 11}, {"loc": [2.6793723106384277, 4.819819450378418], "id": 2773, "title": "FormLM: Recommending Creation Ideas for Online Forms by Modelling Semantic and Structural Information", "authors": "Yijia Shao, Mengyu Zhou, Yifan Zhong, Tao Wu, Hongwei Han, Shi Han, Gideon Huang and Dongmei Zhang", "abstract": "Online forms are widely used to collect data from human and have a multi-billion market. Many software products provide online services for creating semi-structured forms where questions and descriptions are organized by predefined structures. However, the design and creation process of forms is still tedious and requires expert knowledge. To assist form designers, in this work we present FormLM to model online forms (by enhancing pre-trained language model with form structural information) and recommend form creation ideas (including question / options recommendations and block type suggestion). For model training and evaluation, we collect the first public online form dataset with 62K online forms. Experiment results show that FormLM significantly outperforms general-purpose language models on all tasks, with an improvement by 4.71 on Question Recommendation and 10.6 on Block Type Suggestion in terms of ROUGE-1 and Macro-F1, respectively.", "track": "NLP Applications", "label": 0}, {"loc": [8.031818389892578, 5.735328197479248], "id": 2785, "title": "Generate, Discriminate and Contrast: A Semi-Supervised Sentence Representation Learning Framework", "authors": "Yiming Chen, Yan Zhang, Bin Wang, ZUOZHU LIU and Haizhou Li", "abstract": "Most sentence embedding techniques heavily rely on expensive human-annotated sentence pairs as the supervised signals. Despite the use of large-scale unlabeled data, the performance of unsupervised methods typically lags far behind that of the supervised counterparts in most downstream tasks. In this work, we propose a semi-supervised sentence embedding framework, GenSE, that effectively leverages large-scale unlabeled data. Our method include three parts: 1) Generate: A generator/discriminator model is jointly trained to synthesize sentence pairs from open-domain unlabeled corpus; 2) Discriminate: Noisy sentence pairs are filtered out by the discriminator to acquire high-quality positive and negative sentence pairs; 3) Contrast: A prompt-based contrastive approach is presented for sentence representation learning with both annotated and synthesized data. Comprehensive experiments show that GenSE achieves an average correlation score of 85.19 on the STS datasets and consistent performance improvement on four domain adaptation tasks, significantly surpassing the state-of-the-art methods and convincingly corroborating its effectiveness and generalization ability.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [8.112783432006836, 9.416691780090332], "id": 2786, "title": "GPS: Genetic Prompt Search for Efficient Few-Shot Learning", "authors": "Hanwei Xu, Yujun Chen, Yulun Du, Nan Shao, wang yanggang, Haiyu Li and Zhilin Yang", "abstract": "Prompt-based techniques have demostrated great potential for improving the few-shot generalization of pretrained language models. However, their performance heavily relies on the manual design of prompts and thus requiring a lot of human efforts. In this paper, we introduce Genetic Prompt Search (GPS) to improve few-shot learning with prompts, which utilizes a genetic algorithm to automatically search for the best prompt.\nGPS is gradient-free and requires no update of model parameters but only a small validation set. \nExperiments on diverse datasets proved the effectiveness of GPS, which outperforms manual prompts by a large margin of 2.6 points. \nOur method is also better than other parameter-efficient tuning methods such as prompt tuning.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [4.210636615753174, 5.173218250274658], "id": 2793, "title": "Multitask Instruction-based Prompting for Fallacy Recognition", "authors": "Tariq Alhindi, Tuhin Chakrabarty, Elena Musi and Smaranda Muresan", "abstract": "Fallacies are used as seemingly valid arguments to support a position and persuade the audience about its validity. Recognizing fallacies is an intrinsically difficult task both for humans and machines. Moreover, a big challenge for computational models lies in the fact that fallacies are formulated differently across the datasets with differences in the input format (e.g., question-answer pair, sentence with fallacy fragment), genre (e.g., social media, dialogue, news), as well as types and number of fallacies (from 5 to 18 types per dataset). To move towards solving the fallacy recognition task, we approach these differences across datasets as multiple tasks and show how instruction-based prompting in a multitask setup based on the T5 model improves the results against approaches built for a specific dataset such as T5, BERT or GPT-3. We show the ability of this multitask prompting approach to recognize 28 unique fallacies across domains and genres and study the effect of model size and prompt choice by analyzing the per-class (i.e., fallacy type) results. Finally, we analyze the effect of annotation quality on model performance, and the feasibility of complementing this approach with external knowledge.", "track": "NLP Applications", "label": 0}, {"loc": [5.596348285675049, 11.634411811828613], "id": 2804, "title": "Rethinking Multi-Modal Alignment in Multi-Choice VideoQA from Feature and Sample Perspectives", "authors": "Shaoning Xiao, Long Chen, Kaifeng Gao, Zhao Wang, Yi Yang, Zhimeng Zhang and Jun Xiao", "abstract": "Reasoning about causal and temporal event relations in videos is a new destination of Video Question Answering (VideoQA). The major stumbling block to achieve this purpose is the semantic gap between language and video since they are at different levels of abstraction. Existing efforts mainly focus on designing sophisticated architectures while utilizing frame- or object-level visual representations. In this paper, we reconsider the multi-modal alignment problem in VideoQA from feature and sample perspectives to achieve better performance. From the view of feature, we break down the video into trajectories and first leverage trajectory feature in VideoQA to enhance the alignment between two modalities. Moreover, we adopt a heterogeneous graph architecture and design a hierarchical framework to align both trajectory-level and frame-level visual feature with language feature. In addition, we found that VideoQA models are largely dependent on language\npriors and always neglect visual-language interactions. Thus, two effective yet portable training augmentation strategies are designed to strengthen the cross-modal correspondence ability of our model from the view of sample. Extensive results show that our method outperforms all the state-of the-art models on the challenging NExT-QA benchmark.", "track": "Question Answering", "label": 11}, {"loc": [4.305737495422363, 4.950861930847168], "id": 2808, "title": "Towards Table-to-Text Generation with Pretrained Language Model: A Table Structure Understanding and Text Deliberating Approach", "authors": "Miao Chen, Xinjiang Lu, Tong Xu, Yanyan Li, zhou jingbo, Dejing Dou and Hui Xiong", "abstract": "Although remarkable progress on the neural table-to-text methods has been made, the generalization issues hinder the applicability of these models due to the limited source tables. Large-scale pretrained language models sound like a promising solution to tackle such issues. However, how to effectively bridge the gap between the structured table and the text input by fully leveraging table information to fuel the pretrained model is still not well explored. Besides, another challenge of integrating the deliberation mechanism into the text-to-text pretrained model for solving the table-to-text task remains seldom studied. In this paper, to implement the table-to-text generation with pretrained language model, we propose a table structure understanding and text deliberating approach, namely TASD. To be specific, we devise a three-layered multi-head attention network to realize the table-structureaware text generation model with the help of the pretrained language model. Furthermore, a multi-pass decoder framework is adopted to enhance the capability of polishing generated text for table descriptions. The empirical studies, as well as human evaluation, on two public datasets, validate that our approach can generate faithful and fluent descriptive texts for different types of tables.", "track": "Natural Language Generation", "label": 6}, {"loc": [10.352788925170898, 7.4744648933410645], "id": 2820, "title": "Hierarchical Phrase-Based Sequence-to-Sequence Learning", "authors": "Bailin Wang, Ivan Titov, Jacob Andreas and Yoon Kim", "abstract": "This paper describes a neural transducer that maintains the flexibility of standard sequence-to-sequence (seq2seq) models while incorporating hierarchical phrases as a source of inductive bias during training and as explicit constraints during inference. Our approach trains two models: a discriminative parser based on a bracketing transduction grammar whose derivation tree hierarchically aligns source and target phrases, and a neural seq2seq model that learns to translate the aligned phrases one-by-one. We use the same seq2seq model to translate at all phrase scales, which results in two inference modes: one mode in which the parser is discarded and only the seq2seq component is used at the sequence-level, and another in which the parser is combined with the seq2seq model. Decoding in the latter mode is done with the cube-pruned CKY algorithm, which is more involved but can make use of new translation rules during inference. We formalize our model as a source-conditioned synchronous grammar and develop an efficient variational inference algorithm for training. When applied on top of both randomly initialized and pretrained seq2seq models, we find that it performs well compared to baselines on small scale machine translation benchmarks.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [4.5952630043029785, 4.592576503753662], "id": 2832, "title": "Natural Language Deduction with Incomplete Information", "authors": "Zayne R. Sprague, Kaj Bostrom, Swarat Chaudhuri and Greg Durrett", "abstract": "A growing body of work studies how to answer a question or verify a claim by generating a natural language \"proof:\u201d a chain of deductive inferences yielding the answer based on a set of premises. However, these methods can only make sound deductions when they follow from evidence that is given. We propose a new system that can handle the underspecified setting where not all premises are stated at the outset; that is, additional assumptions need to be materialized to prove a claim. By using a natural language generation model to abductively infer a premise given another premise and a conclusion, we can impute missing pieces of evidence needed for the conclusion to be true. Our system searches over two fringes in a bidirectional fashion, interleaving deductive (forward-chaining) and abductive (backward-chaining) generation steps. We sample multiple possible outputs for each step to achieve coverage of the search space, at the same time ensuring correctness by filtering low-quality generations with a round-trip validation procedure. Results on a modified version of the EntailmentBank dataset and a new dataset called Everyday Norms: Why Not? Show that abductive generation with validation can recover premises across in- and out-of-domain settings.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [4.92696475982666, 12.451148986816406], "id": 2833, "title": "Character-centric Story Visualization via Visual Planning and Token Alignment", "authors": "Hong Chen, Rujun Han, Te-Lin Wu, Hideki Nakayama and Nanyun Peng", "abstract": "Story visualization advances the traditional text-to-image generation by enabling multiple image generation based on a complete story. This task requires machines to 1) understand long text inputs, and 2) produce a globally consistent image sequence that illustrates the contents of the story. A key challenge of consistent story visualization is to preserve characters that are essential in stories. To tackle the challenge, we propose to adapt a recent work that augments VQ-VAE with a text-to-visual-token (transformer) architecture. Specifically, we modify the text-to-visual-token module with a two-stage framework: 1) character token planning model that predicts the visual tokens for characters only; 2) visual token completion model that generates the remaining visual token sequence, which is sent to VQ-VAE for finalizing image generations. To encourage characters to appear in the images, we further train the two-stage framework with a character-token alignment objective. Extensive experiments and evaluations demonstrate that the proposed method excels at preserving characters and can produce higher quality image sequences compared with the strong baselines.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [3.5250167846679688, 4.648708343505859], "id": 2834, "title": "ASQA: Factoid Questions Meet Long-Form Answers", "authors": "Ivan Stelmakh, Yi Luan, Bhuwan Dhingra and Ming-Wei Chang", "abstract": "Recent progress on open domain factoid question answering (QA) does not easily transfer to the task of long-form QA, where the goal is to answer questions that require in-depth explanations. The hurdles include a lack of high-quality data and the absence of a well-defined notion of an answer's quality. In this work, we address these problems by releasing a novel dataset and a task that we call ASQA (Answer Summaries for Questions which are Ambiguous); and proposing a reliable metric for measuring performance on ASQA. Our task focuses on ambiguous factoid questions which have different correct answers depending on the interpretation. Answers to ambiguous questions should combine factual information from multiple sources into a coherent long-form summary that resolves the ambiguity. In contrast to existing long-form QA tasks (such as ELI5), ASQA admits a clear notion of correctness: a user faced with a good summary should be able to answer different interpretations of the original ambiguous question. Our analysis demonstrates an agreement between this metric and human judgments, and reveals a considerable gap between human performance and strong baselines.", "track": "Question Answering", "label": 11}, {"loc": [8.257851600646973, 6.715014934539795], "id": 2837, "title": "Algorithms for Acyclic Weighted Finite-State Automata with Failure Arcs", "authors": "Anej Svete, Benjamin Dayan, Ryan Cotterell, Tim Vieira and Jason Eisner", "abstract": "Weighted finite-state automata (WSFAs) are\ncommonly used in NLP. Failure transitions are\na useful extension for compactly representing\nbackoffs or interpolation in n-gram models\nand CRFs, which are special cases of WFSAs.\nUnfortunately, applying standard algorithms\nfor computing the pathsum requires expand-\ning these compact failure transitions. As a\nresult, na \u0308\u0131ve computation of the pathsum in\nacyclic WFSAs with failure transitions runs in\nO(|Q|2|\u03a3|) (O(|Q||\u03a3|) for deterministic WF-\nSAs) while the equivalent algorithm in normal\nWFSAs runs in O(|E|), where E represents\nthe set of transitions, Q the set of states, and\n\u03a3 the alphabet. In this work, we present more\nefficient algorithms for computing the pathsum\nin sparse acyclic WFSAs, i.e., WFSAs with av-\nerage out symbol fraction s \u226a 1. In those,\nbackward runs in O(s|Q||\u03a3|). We propose\nan algorithm for semiring-weighted automata\nwhich runs in O(|E| + s|\u03a3||Q||Tmax| log |\u03a3|),\nwhere |Tmax| is the size of the largest con-\nnected component of failure transitions. Ad-\nditionally, we propose faster algorithms for\ntwo specific cases. For ring-weighted WF-\nSAs we propose an algorithm with complex-\nity O(|E| + s|\u03a3||Q||\u03c0max|), where |\u03c0max| de-\nnotes the longest path length of failure transi-\ntions stemming from q and \u03a3(q) the set of sym-\nbols on the outgoing transitions from q. For\nsemiring-weighted WFSAs whose failure tran-\nsition topology satisfies a condition exemplified\nby CRFs, we propose an algorithm with com-\nplexity O(|E| + s|\u03a3||Q| log |\u03a3|).", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [0.7406312227249146, 8.00481128692627], "id": 2847, "title": "Towards Better Document-level Relation Extraction via Iterative Inference", "authors": "Liang Zhang, Jinsong Su, Yidong Chen, Zhongjian Miao, Min Zijun, Hu Qingguo and xiaodong shi", "abstract": "Document-level relation extraction (RE) aims to extract the relations between entities from the input document that usually containing many difficultly-predicted entity pairs whose relations can only be predicted through relational inference. Existing methods usually directly predict the relations of all entity pairs of input document in a one-pass manner, ignoring the fact that predictions of some entity pairs heavily depend on the predicted results of other pairs. To deal with this issue, in this paper, we propose a novel document-level RE model with iterative inference. Our model is mainly composed of two modules: 1) a base module expected to provide preliminary relation predictions on entity pairs; 2) an inference module introduced to refine these preliminary predictions by iteratively dealing with difficultly-predicted entity pairs depending on other pairs in an easy-to-hard manner. Unlike previous methods which only consider feature information of entity pairs, our inference module is equipped with two Extended Cross Attention units, allowing it to exploit both feature information and previous predictions of entity pairs during relational inference. Furthermore, we adopt a two-stage strategy to train our model. At the first stage, we only train our base module. During the second stage, we train the whole model, where contrastive learning is introduced to enhance the training of inference module. Experimental results on three commonly-used datasets show that our model consistently outperforms other competitive baselines.", "track": "Information Extraction", "label": 5}, {"loc": [7.580563068389893, 3.696763753890991], "id": 2864, "title": "Efficient Adversarial Training with Robust Early-Bird Tickets", "authors": "zhiheng xi, rui zheng, Tao Gui, Qi Zhang and Xuanjing Huang", "abstract": "Adversarial training is one of the most powerful methods to improve the robustness of pre-trained language models (PLMs). However, this approach is typically more expensive than traditional fine-tuning because of the necessity to generate adversarial examples via gradient descent. Delving into the optimization process of adversarial training, we find that robust connectivity patterns emerge in the early training phase (typically $0.15\\sim0.3$ epochs), far before parameters converge. Inspired by this finding, we dig out robust early-bird tickets (i.e., subnetworks) to develop an efficient adversarial training method: (1) searching for robust tickets with structured sparsity in the early stage; (2) fine-tuning robust tickets in the remaining time. To extract the robust tickets as early as possible, we design a ticket convergence metric to automatically terminate the searching process. Experiments show that the proposed efficient adversarial training method can achieve up to $7\\times \\sim 13 \\times$ training speedups while maintaining comparable or even better robustness compared to the most competitive state-of-the-art adversarial training methods.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [10.199402809143066, 6.763775825500488], "id": 2867, "title": "SMaLL-100: Introducing Shallow Multilingual Machine Translation Model for Low-Resource Languages", "authors": "Alireza Mohammadshahi, Vassilina Nikoulina, Alexandre Berard, Caroline Brun, James Henderson and Laurent Besacier", "abstract": "In recent years, multilingual machine translation models have achieved promising performance on low-resource language pairs by sharing information between similar languages, thus enabling zero-shot translation. To overcome the \"curse of multilinguality\", these models often opt for scaling up the number of parameters, which makes their use in resource-constrained environments challenging. We introduce SMaLL-100, a distilled version of the M2M-100(12B) model, a massively multilingual machine translation model covering 100 languages. We train SMaLL-100 with uniform sampling across all language pairs and therefore focus on preserving the performance of low-resource languages. We evaluate SMaLL-100 on different low-resource benchmarks: FLORES-101, Tatoeba, and TICO-19 and demonstrate that it outperforms previous massively multilingual models of comparable sizes (200-600M) while improving inference latency and memory usage. Additionally, our model achieves comparable results to M2M-100 (1.2B), while being 3.6x smaller and 4.3x faster at inference.", "track": "Machine Translation", "label": 10}, {"loc": [8.111284255981445, 3.0337202548980713], "id": 2869, "title": "TextFusion: Privacy-Preserving Pre-trained Model Inference via Token Fusion", "authors": "Xin Zhou, Jinzhu Lu, Tao Gui, Ruotian Ma, Zichu Fei, Yuran Wang, Yong Ding, Yibo Cheung, Qi Zhang and Xuanjing Huang", "abstract": "Recently, more and more pre-trained language models are released as a cloud service. It allows users who lack computing resources to perform inference with a powerful model by uploading data to the cloud. The plain text may contain private information, as the result, users prefer to do partial computations locally and upload intermediate representations to the cloud for subsequent inference.\nHowever, recent studies have shown that intermediate representations can also be recovered to plain text with reasonable accuracy, thus the risk of privacy leakage still exists. \nTo address this issue, we propose TextFusion, a novel method for preserving inference privacy.\nSpecifically, we train a Fusion Predictor to dynamically fuse token representations, which hides multiple private token representations behind an unrecognizable one.\nFurthermore, an adversarial training regime is employed to privatize these representations. In this way, the cloud only receives incomplete and perturbed representations, making it difficult to accurately recover the complete plain text.\nThe experimental results on diverse classification tasks show that our approach can effectively preserve inference privacy without significantly sacrificing performance in different scenarios.", "track": "Ethics", "label": 21}, {"loc": [5.1357526779174805, 4.940208911895752], "id": 2877, "title": "Learning to Explain Selectively: A Case Study on Question Answering", "authors": "Shi Feng and Jordan Boyd-Graber", "abstract": "Explanations promise to bridge the gap between humans and AI, yet it remains difficult to achieve consistent improvement in AI-augmented human decision making. The usefulness of AI explanations depends on many factors, and always showing the same type of explanation in all cases is suboptimal\u2014so is relying on heuristics to adapt explanations for each scenario. We propose learning to explain selectively: for each decision that the user makes, we use a model to choose the best explanation from a set of candidates, and update this model with feedback to optimize human performance. We experiment on a question answering task, Quizbowl, and show that selective explanations improve human performance for both experts and crowdworkers.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [10.37665843963623, 7.070738315582275], "id": 2878, "title": "ConsistTL: Modeling Consistency in Transfer Learning for Low-Resource Neural Machine Translation", "authors": "Zhaocong Li, Xuebo Liu, Derek F. Wong, Lidia S. Chao and Min Zhang", "abstract": "Transfer learning is a simple and powerful method that can be used to boost model performance of low-resource neural machine translation (NMT). Existing transfer learning methods for NMT are static, which simply transfer knowledge from a parent model to a child model once via parameter initialization. In this paper, we propose a novel transfer learning method for NMT, namely ConsistTL, which can continuously transfer knowledge from the parent model during the training of the child model. Specifically, for each training instance of the child model, ConsistTL constructs the semantically-equivalent instance for the parent model and encourages prediction consistency between the parent and child for this instance, which is equivalent to the child model learning each instance under the guidance of the parent model. Experimental results on five low-resource NMT tasks demonstrate that ConsistTL results in significant improvements over strong transfer learning baselines, with a gain up to 1.7 BLEU over the existing back-translation model on the widely-used WMT17 Turkish-English benchmark. Further analysis reveals that ConsistTL can improve the inference calibration of the child model. Code and scripts are freely available at https://github.com/NLP2CT/ConsistTL.", "track": "Machine Translation", "label": 10}, {"loc": [6.13837194442749, 5.379065990447998], "id": 2894, "title": "Better Hit the Nail on the Head than Beat around the Bush: Removing Protected Attributes with a Single Projection", "authors": "Pantea Haghighatkhah, Antske Fokkens, Pia Sommerauer, Bettina Speckmann and Kevin Verbeek", "abstract": "Bias elimination and recent probing studies attempt to remove specific information from embedding spaces. \nHere it is important to remove as much of the target information as possible, while preserving any other information present. \nINLP is a popular recent method which removes specific information through iterative nullspace projections.\nMultiple iterations, however, increase the risk that information other than the target is negatively affected.\nWe introduce two methods that find a single targeted projection: Mean Projection (MP, more efficient) and Tukey Median Projection (TMP, with theoretical guarantees). \nOur comparison between MP and INLP shows that (1) one MP projection removes linear separability based on the target and (2) MP has less impact on the overall space.\nFurther analysis shows that applying random projections after MP leads to the same overall effects on the embedding space as the multiple projections of INLP. \nApplying one targeted (MP) projection hence is methodologically cleaner than applying multiple (INLP) projections that introduce random effects.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [0.9153674840927124, 7.923649787902832], "id": 2896, "title": "IELM: An Open Information Extraction Benchmark for Pre-Trained Language Models", "authors": "Chenguang Wang, Xiao Liu and Dawn Song", "abstract": "We introduce a new open information extraction (OIE) benchmark for pre-trained language models (LM). Recent studies have demonstrated that pre-trained LMs, such as BERT and GPT, may store linguistic and relational knowledge. In particular, LMs are able to answer \"fill-in-the-blank\" questions when given a pre-defined relation category. Instead of focusing on pre-defined relations, we create an OIE benchmark aiming to fully examine the open relational information present in the pre-trained LMs. We accomplish this by turning pre-trained LMs into zero-shot OIE systems. Surprisingly, pre-trained LMs are able to obtain competitive performance on both standard OIE datasets (CaRB and Re-OIE2016) and two new large-scale factual OIE datasets (TAC KBP-OIE and Wikidata-OIE) that we establish via distant supervision. For instance, the zero-shot pre-trained LMs outperform the F1 score of the state-of-the-art supervised OIE methods on our factual OIE datasets without needing to use any training sets.", "track": "Information Extraction", "label": 5}, {"loc": [1.7339813709259033, 9.104886054992676], "id": 2898, "title": "ConNER: Consistency Training for Cross-lingual Named Entity Recognition", "authors": "Ran Zhou, Xin Li, Lidong Bing, Erik Cambria, Luo Si and Chunyan Miao", "abstract": "Cross-lingual named entity recognition (NER) suffers from data scarcity in the target languages, especially under zero-shot settings. \nExisting translate-train or knowledge distillation methods attempt to bridge the language gap, but often introduce a high level of noise. To solve this problem, consistency training methods regularize the model to be robust towards perturbations on data or hidden states.\nHowever, such methods are likely to violate the consistency hypothesis, or mainly focus on coarse-grain consistency.\nWe propose ConNER as a novel consistency training framework for cross-lingual NER, which comprises of: (1) translation-based consistency training on unlabeled target-language data, and (2) dropout-based consistency training on labeled source-language data. \nConNER effectively leverages unlabeled target-language data and alleviates overfitting on the source language to enhance the cross-lingual adaptability. \nExperimental results show our ConNER achieves consistent improvement over various baseline methods.", "track": "Multilinguality", "label": 13}, {"loc": [1.6994454860687256, 5.383082389831543], "id": 2915, "title": "A Sequential Flow Control Framework for Multi-hop Knowledge Base Question Answering", "authors": "Minghui Xie, Chuzhan Hao and Peng Zhang", "abstract": "One of the key challenges of knowledge base question answering (KBQA) is the multi-hop reasoning. Since in different hops, one attends to different parts of question, it is important to dynamically represent the question semantics for each hop. Existing methods, however, (i) infer the dynamic question representation only through coarse-grained attention mechanisms, which may bring information loss, (ii) and have not effectively modeled the sequential logic, which is crucial for the multi-hop reasoning process in KBQA.\nTo address these issues, we propose a sequential reasoning self-attention mechanism to capture the crucial reasoning information of each single hop in a more fine-grained way. Based on Gated Recurrent Unit (GRU) which is good at modeling sequential process, we propose a simple but effective GRU-inspired Flow Control (GFC) framework to model sequential logic in the whole multi-hop process.\nExtensive experiments on three popular benchmark datasets have demonstrated the superior effectiveness of our model. In particular, GFC achieves new state-of-the-art Hits@1 of 76.8% on WebQSP and is also effective when KB is incomplete. Our code and data are available at https://github.com/Xie-Minghui/GFC.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [4.749297618865967, 3.271265745162964], "id": 2916, "title": "ACENet: Attention Guided Commonsense Reasoning on Hybrid Knowledge Graph", "authors": "Chuzhan Hao, Minghui Xie and Peng Zhang", "abstract": "Augmenting pre-trained language models (PLMs) with knowledge graphs (KGs) has demonstrated superior performance on commonsense reasoning. Given a commonsense based QA context (question and multiple choices), existing approaches usually estimate the plausibility of candidate choices separately based on their respective retrieved KGs, without considering the interference among different choices. \nIn this paper, we propose an Attention guided Commonsense rEasoning Network (ACENet)\\footnote{\\url{https://github.com/HAOChuzhan/ACENet}.} to endow the neural network with the capability of integrating hybrid knowledge. Specifically, our model applies the multi-layer interaction of answer choices to continually strengthen correct choice information and guide the message passing of GNN. In addition, we also design a mix attention mechanism of nodes and edges to iteratively select supporting evidence on hybrid knowledge graph. Experimental results demonstrate the effectiveness of our proposed model through considerable performance gains across CommonsenseQA and OpenbookQA datasets.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [0.7506502270698547, 7.994113922119141], "id": 2935, "title": "Revisiting DocRED - Addressing the False Negative Problem in Relation Extraction", "authors": "Qingyu Tan, Lu Xu, Lidong Bing, Hwee Tou Ng and Sharifah Mahani Aljunied", "abstract": "The DocRED dataset is one of the most popular and widely used benchmarks for document-level relation extraction (RE). It adopts a recommend-revise annotation scheme so as to have a large-scale annotated dataset. However, we find that the annotation of DocRED is incomplete, i.e., false negative samples are prevalent. We analyze the causes and effects of the overwhelming false negative problem in the DocRED dataset. To address the shortcoming, we re-annotate 4,053 documents in the DocRED dataset by adding the missed relation triples back to the original DocRED. We name our revised DocRED dataset Re-DocRED. We conduct extensive experiments with state-of-the-art neural models on both datasets, and the experimental results show that the models trained and evaluated on our Re-DocRED achieve performance improvements of around 13 F1 points. Moreover, we conduct a comprehensive analysis to identify the potential areas for further improvement.", "track": "Resources and Evaluation", "label": 1}, {"loc": [3.804398775100708, 9.940906524658203], "id": 2938, "title": "Towards Summary Candidates Fusion", "authors": "Mathieu Ravaut, Shafiq Joty and Nancy Chen", "abstract": "Sequence-to-sequence deep neural models fine-tuned for abstractive summarization can achieve great performance on datasets with enough human annotations. Yet, it has been shown that they have not reached their full potential, with a wide gap between the top beam search output and the oracle beam. Recently, re-ranking methods have been proposed, to learn to select a better summary candidate. However, such methods are limited by the summary quality aspects captured by the first-stage candidates. To bypass this limitation, we propose a new paradigm in second-stage abstractive summarization called SummaFusion that fuses several summary candidates to produce a novel abstractive second-stage summary. Our method works well on several summarization datasets, improving both the ROUGE scores and qualitative properties of fused summaries. It is especially good when the candidates to fuse are worse, such as in the few-shot setup where we set a new state-of-the art. We will make our code and checkpoints available at https://github.com/ntunlp/SummaFusion/.", "track": "Summarization", "label": 14}, {"loc": [6.224789142608643, 12.408352851867676], "id": 2942, "title": "Multimodal Robustness for Neural Machine Translation", "authors": "Yuting Zhao and Ioan Calapodescu", "abstract": "In this paper, we look at the case of a Generic text-to-text NMT model that has to deal with data coming from various modalities, like speech, images, or noisy text extracted from the web. We propose a two-step method, based on composable adapters, to deal with this problem of Multimodal Robustness. In a first step, we separately learn domain adapters and modality specific adapters, to deal with noisy input coming from various sources: ASR, OCR, or noisy text (UGC). In a second step, we combine these components at runtime via dynamic routing or, when the source of noise is unknown, via two new transfer learning mechanisms (Fast Fusion and Multi Fusion). We show that our method provides a flexible, state-of-the-art, architecture able to deal with noisy multimodal inputs.", "track": "Machine Translation", "label": 10}, {"loc": [0.46370312571525574, 7.119673252105713], "id": 2950, "title": "TranSHER: Translating Knowledge Graph Embedding with Hyper-Ellipsoidal Restriction", "authors": "Yizhi Li, Wei Fan, Chao Liu, Chenghua Lin and Jiang Qian", "abstract": "Knowledge graph embedding methods are important for the knowledge graph completion (or link prediction) task.\nOne state-of-the-art method, PairRE, leverages two separate vectors to model complex relations (i.e., 1-to-N, N-to-1, and N-to-N) in knowledge graphs. \nHowever, such a method strictly restricts entities on the hyper-ellipsoid surfaces which limits the optimization of entity distribution, leading to suboptimal performance of knowledge graph completion. \nTo address this issue, we propose a novel score function TranSHER, which leverages relation-specific translations between head and tail entities to relax the constraint of hyper-ellipsoid restrictions. \nBy introducing an intuitive and simple relation-specific translation, TranSHER can provide more direct guidance on optimization and capture more semantic characteristics of entities with complex relations. \nExperimental results show that TranSHER achieves state-of-the-art performance on link prediction and generalizes well to datasets in different domains and scales. \nOur codes are public available at\nhttps://github.com/yizhilll/TranSHER.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [3.9468207359313965, 7.264837265014648], "id": 2966, "title": "IRRGN: An Implicit Relational Reasoning Graph Network for Multi-turn Response Selection", "authors": "Jingcheng Deng, Hengwei Dai, Xuewei Guo, Yuanchen Ju and Wei Peng", "abstract": "The task of response selection in multi-turn dialogue is to find the best option from all candidates. In order to improve the reasoning ability of the model, previous studies pay more attention to using explicit algorithms to model the dependencies between utterances, which are deterministic, limited and inflexible. In addition, few studies consider differences between the options before and after reasoning. In this paper, we propose an Implicit Relational Reasoning Graph Network to address these issues, which consists of the Utterance Relational Reasoner (URR) and the Option Dual Comparator (ODC). URR aims to implicitly extract dependencies between utterances, as well as utterances and options, and make reasoning with relational graph convolutional networks. ODC focuses on perceiving the difference between the options through dual comparison, which can eliminate the interference of the noise options. Experimental results on two multi-turn dialogue reasoning benchmark datasets MuTual and MuTualplus show that our method significantly improves the baseline of four pre-trained language models and achieves state-of-the-art performance. The model surpasses human performance for the first time on the MuTual dataset.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [0.626977264881134, 6.9304022789001465], "id": 2997, "title": "Predicting Prerequisite Relations for Unseen Concepts", "authors": "Yaxin Zhu and Hamed Zamani", "abstract": "Concept prerequisite learning (CPL) plays a key role in developing technologies that assist people to learn a new complex topic or concept. Previous work commonly assumes that all concepts are given at training time and solely focuses on predicting the unseen prerequisite relationships between them. However, many real-world scenarios deal with concepts that are left undiscovered at training time, which is relatively unexplored. This paper studies this problem and proposes a novel alternating knowledge distillation approach to take advantage of both content- and graph-based models for this task. Extensive experiments on three public benchmarks demonstrate up to 10% improvements in terms of F1 score.", "track": "Information Extraction", "label": 5}, {"loc": [5.374624729156494, 12.481864929199219], "id": 3013, "title": "Contrastive Learning with Expectation-Maximization for Weakly Supervised Phrase Grounding", "authors": "Keqin Chen, Richong Zhang, Samuel Mensah and Yongyi Mao", "abstract": "Weakly supervised phrase grounding aims to learn an alignment between phrases in a caption and objects in a corresponding image using only caption-image annotations, i.e., without phrase-object annotations. Previous methods typically use a caption-image contrastive loss to indirectly supervise the alignment between phrases and objects, which hinders the maximum use of the intrinsic structure of the multimodal data and leads to unsatisfactory performance. In this work, we directly use the phrase-object contrastive loss in the condition that no positive annotation is available in the first place. Specifically, we propose a novel contrastive learning framework based on the expectation-maximization algorithm that adaptively refines the target prediction. Experiments on two widely used benchmarks, Flickr30K Entities and RefCOCO+, demonstrate the effectiveness of our framework. We obtain 63.05% top-1 accuracy on Flickr30K Entities and 59.51%/43.46% on RefCOCO+ TestA/TestB, outperforming the previous methods by a large margin, even surpassing a previous SoTA that uses a pre-trained vision-language model. Furthermore, we deliver a theoretical analysis of the effectiveness of our method from the perspective of the maximum likelihood estimate with latent variables.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [8.06466293334961, 9.64835262298584], "id": 3019, "title": "Beyond prompting: Making Pre-trained Language Models Better Zero-shot Learners by Clustering Representations", "authors": "Yu Fei, Zhao Meng, Ping Nie, Roger Wattenhofer and Mrinmaya Sachan", "abstract": "Recent work has demonstrated that pre-trained language models (PLMs) are zero-shot learners. However, most existing zero-shot methods involve heavy human engineering or complicated self-training pipelines, hindering their application to new situations. In this work, we show that zero-shot text classification can be improved simply by clustering texts in the embedding spaces of PLMs. Specifically, we fit the unlabeled texts with a Bayesian Gaussian Mixture Model after initializing cluster positions and shapes using class names. Despite its simplicity, this approach achieves superior or comparable performance on both topic and sentiment classification datasets and outperforms prior works significantly on unbalanced datasets. We further explore the applicability of our clustering approach by evaluating it on 14 datasets with more diverse topics, text lengths, and numbers of classes. Our approach achieves an average of 20% absolute improvement over prompt-based zero-shot learning. Finally, we compare different PLM embedding spaces and find that texts are well-clustered by topics even if the PLM is not explicitly pre-trained to generate meaningful sentence embeddings. This work indicates that PLM embeddings can categorize texts without task-specific fine-tuning, thus providing a new way to analyze and utilize their knowledge and zero-shot learning ability.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [2.3667678833007812, 8.667073249816895], "id": 3021, "title": "Generalizing over Long Tail Concepts for Medical Term Normalization", "authors": "Beatrice Portelli, Simone Scaboro, Enrico Santus, Hooman Sedghamiz, Emmanuele Chersoni and Giuseppe Serra", "abstract": "Medical term normalization consists in mapping a piece of text to a large number of output classes.\nGiven the small size of the annotated datasets and the extremely long tail distribution of the concepts, it is of utmost importance to develop models that are capable to generalize to scarce or unseen concepts.\nAn important attribute of most target ontologies is their hierarchical structure. In this paper we introduce a simple and effective learning strategy that leverages such information to enhance the generalizability of both discriminative and generative models.\nThe evaluation shows that the proposed strategy produces state-of-the-art performance on seen concepts and consistent improvements on unseen ones, allowing also for efficient zero-shot knowledge transfer across text typologies and datasets.", "track": "NLP Applications", "label": 0}, {"loc": [3.6964547634124756, 9.903865814208984], "id": 3028, "title": "Unsupervised Opinion Summarisation in the Wasserstein Space", "authors": "Jiayu Song, Iman Munire Bilal, Adam Tsakalidis, Rob Procter and Maria Liakata", "abstract": "Opinion summarisation synthesises opinions expressed in a group of documents discussing\nthe same topic to produce a single summary. Recent work has looked at opinion summarisation of clusters of social media posts. Such posts are noisy and have unpredictable structure, posing additional challenges for the construction of the summary distribution and the preservation of meaning compared to online reviews, which has been so far the focus on opinion summarisation. To address these \nchallenges we present WassOS, an unsupervised abstractive summarization model which makes\nuse of the Wasserstein distance. A Variational Autoencoder is first used to obtain the distribution of documents/posts, and the summary distribution is obtained as the Wasserstein barycenter. We create separate disentangled latent semantic and syntactic representations of the summary, which are fed into a GRU decoder with a transformer layer to produce the final summary. Our experiments on\nmultiple datasets including reviews, Twitter clusters and Reddit threads show that WassOS\nalmost always outperforms the state-of-the-art on ROUGE metrics and consistently produces\nthe best summaries with respect to meaning preservation according to human evaluations.", "track": "Summarization", "label": 14}, {"loc": [6.057355880737305, 12.322936058044434], "id": 3033, "title": "Bloom Library: Multimodal Datasets in 300+ Languages for a Variety of Downstream Tasks", "authors": "Colin Leong, Joshua Nemecek, Jacob Mansdorfer, Anna Filighera, Abraham Owodunni and Daniel Whitenack", "abstract": "We present Bloom Library, a linguistically diverse set of multimodal and multilingual datasets for language modeling, image captioning, visual storytelling, and speech synthesis/recognition. These datasets represent either the most, or among the most, multilingual datasets for each of the included downstream tasks. In total, the initial release of the Bloom Library datasets covers 363 languages across 32 language families. We train downstream task models for various languages represented in the data, showing the viability of the data for future work in low-resource, multimodal NLP and establishing the first known baselines for these downstream tasks in certain languages (e.g., Bisu [bzi], with an estimated population of 700 users). Some of these first-of-their-kind baselines are comparable to state-of-the-art performance for higher-resourced languages. The Bloom Library datasets are released under Creative Commons licenses on the Hugging Face datasets hub to catalyze more linguistically diverse research in the included downstream tasks.", "track": "Resources and Evaluation", "label": 1}, {"loc": [10.709479331970215, 6.9167304039001465], "id": 3045, "title": "Disentangling Uncertainty in Machine Translation Evaluation", "authors": "Chrysoula Zerva, Taisiya Glushkova, Ricardo Rei and Andr\u00e9 F. T. Martins", "abstract": "Trainable evaluation metrics for machine translation (MT) exhibit strong correlation with human judgements, but they are often hard to interpret and might produce unreliable scores \nunder noisy or out-of-domain data. \nRecent work has attempted to mitigate this with simple uncertainty quantification techniques (Monte Carlo dropout and deep ensembles), however these techniques (as we show) are limited in several ways \u2013 for example, they are unable to distinguish between different kinds of uncertainty, and they are time and memory consuming. \nIn this paper, we propose more powerful and efficient uncertainty predictors for MT evaluation, and we assess their ability to target different sources of aleatoric and epistemic uncertainty. \nTo this end, we develop and compare training objectives for the COMET metric to enhance it with an uncertainty prediction output, including heteroscedastic regression, divergence minimization, and direct uncertainty prediction.\nOur experiments show improved results on uncertainty prediction for the WMT metrics task datasets, with a substantial reduction in computational costs. Moreover, they demonstrate the ability of these predictors to address specific uncertainty causes in MT evaluation, such as low quality references and \nout-of-domain data.", "track": "Machine Translation", "label": 10}, {"loc": [1.605499029159546, 8.59281063079834], "id": 3048, "title": "Does Your Model Classify Entities Reasonably? Diagnosing and Mitigating Spurious Correlations in Entity Typing", "authors": "Nan Xu, Fei Wang, Bangzheng Li, Mingtao Dong and Muhao Chen", "abstract": "Entity typing aims at predicting one or more words that describe the type(s) of a specific mention in a sentence. Due to shortcuts from surface patterns to annotated entity labels and biased training, existing entity typing models are subject to the problem of spurious correlations. To comprehensively investigate the faithfulness and reliability of entity typing methods, we first systematically define distinct kinds of model biases that are reflected mainly from spurious correlations. Particularly, we identify six types of existing model biases, including mention-context bias, lexical overlapping bias, named entity bias, pronoun bias, dependency bias, and overgeneralization bias. To mitigate model biases, we then introduce a counterfactual data augmentation method. By augmenting the original training set with their debiased\ncounterparts, models are forced to fully comprehend sentences and discover the fundamental cues for entity typing, rather than relying on spurious correlations for shortcuts. Experimental results on the UFET dataset show our counterfactual data augmentation approach helps improve generalization of different entity typing models with consistently better performance on both the original and debiased test sets.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [1.628932237625122, 8.60616397857666], "id": 3049, "title": "EDIN: An End-to-end Benchmark and Pipeline for Unknown Entity Discovery and Indexing", "authors": "Nora Kassner, Fabio Petroni, Mikhail Plekhanov, Sebastian Riedel and Nicola Cancedda", "abstract": "Existing work on Entity Linking mostly assumes that the reference knowledge base is complete, and therefore all mentions can be linked. In practice this is hardly ever the case, as knowledge bases are incomplete and because novel concepts arise constantly. We introduce the temporally segmented Unknown Entity Discovery and Indexing (EDIN)-benchmark where unknown entities, that is entities not part of the knowledge base and without descriptions and labeled mentions, have to be integrated into an existing entity linking system. By contrasting EDIN with zero-shot entity linking, we provide insight on the additional challenges it poses. Building on dense-retrieval based entity linking, we introduce the end-to-end EDIN-pipeline that detects, clusters, and indexes mentions of unknown entities in context. Experiments show that indexing a single embedding per entity unifying the information of multiple mentions works better than indexing mentions independently.", "track": "Information Extraction", "label": 5}, {"loc": [2.472520351409912, 7.201533317565918], "id": 3059, "title": "POQue: Asking Participant-specific Outcome Questions for a Deeper Understanding of Complex Events", "authors": "Sai P. Vallurupalli, Sayontan Ghosh, Katrin Erk, Niranjan Balasubramanian and Francis Ferraro", "abstract": "Knowledge about outcomes is critical for complex event understanding but is hard to acquire.\nWe show that by pre-identifying a participant in a complex event, crowdworkers are able\nto (1) infer the collective impact of salient events that make up the situation, (2) annotate the volitional engagement of participants in causing the situation, and (3) ground the\noutcome of the situation in state changes of the participants. By creating a multi-step interface and a careful quality control strategy, we collect a high quality annotated dataset of\n8K short newswire narratives and ROCStories with high inter-annotator agreement (0.74-0.96\nweighted Fleiss Kappa). Our dataset, POQUe (Participant Outcome Questions), enables the\nexploration and development of models that address multiple aspects of semantic understanding. Experimentally, we show that current language models lag behind human performance in subtle ways through our task formulations that target abstract and specific comprehension of a complex event, its outcome, and a participant's influence over the event culmination.", "track": "Ethic Concerns:Resources and Evaluation", "label": 1}, {"loc": [5.481792449951172, 5.1186113357543945], "id": 3060, "title": "Measuring the Mixing of Contextual Information in the Transformer", "authors": "Javier Ferrando, Gerard I. G\u00e1llego and Marta R. Costa-juss", "abstract": "The Transformer architecture aggregates input information through the self-attention mechanism, but there is no clear understanding of how this information is mixed across the entire model. Additionally, recent works have demonstrated that attention weights alone are not enough to describe the flow of information. In this paper, we consider the whole attention block --multi-head attention, residual connection, and layer normalization-- and define a metric to measure token-to-token interactions within each layer. Then, we aggregate layer-wise interpretations to provide input attribution scores for model predictions. Experimentally, we show that our method, ALTI (Aggregation of Layer-wise Token-to-token Interactions), provides more faithful explanations and increased robustness than gradient-based methods.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [8.255373001098633, 6.907346248626709], "id": 3070, "title": "Dealing with Abbreviations in the Slovenian Biographical Lexicon", "authors": "Angel Daza, Antske Fokkens and Toma\u017e Erjavec", "abstract": "Abbreviations present a significant challenge for NLP systems because they cause tokenization and out-of-vocabulary errors. They can also make the text less readable, especially in reference printed books, where they are extensively used. Abbreviations are especially problematic in low-resource settings, where systems are less robust to begin with. In this paper, we propose a new method for addressing the problems caused by a high density of domain-specific abbreviations in a text. We apply this method to the case of a Slovenian biographical lexicon and evaluate it on a newly developed gold-standard dataset of 51 Slovenian biographies. Our abbreviation identification method performs significantly better than commonly used ad-hoc solutions, especially at identifying unseen abbreviations. We also propose and present the results of a method for expanding the identified abbreviations in context.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [2.11199688911438, 4.051860809326172], "id": 3071, "title": "AfriCLIRMatrix: Enabling Cross-Lingual Information Retrieval for African Languages", "authors": "Odunayo Jude Ogundepo, Xinyu Zhang, Shuo Sun, Kevin Duh and Jimmy Lin", "abstract": "Language diversity in NLP is critical in enabling the development of tools for a wide range of users.\nHowever, there are limited resources for building such tools for many languages, particularly those spoken in Africa.\nFor search, most existing datasets feature few or no African languages, directly impacting researchers' ability to build and improve information access capabilities in those languages.\nMotivated by this, we created AfriCLIRMatrix, a test collection for cross-lingual information retrieval research in 15 diverse African languages.\nIn total, our dataset contains 6 million queries in English and 23 million relevance judgments automatically mined from Wikipedia inter-language links, covering many more African languages than any existing information retrieval test collection.\nIn addition, we release BM25, dense retrieval, and sparse--dense hybrid baselines to provide a starting point for the development of future systems.\nWe hope that these efforts can spur additional work in search for African languages.\nAfriCLIRMatrix can be downloaded at https://github.com/castorini/africlirmatrix.", "track": "Resources and Evaluation", "label": 1}, {"loc": [3.8146071434020996, 4.76609468460083], "id": 3072, "title": "CONDAQA: A Contrastive Reading Comprehension Dataset for Reasoning about Negation", "authors": "Abhilasha Ravichander, Matt Gardner and Ana Marasovic", "abstract": "The full power of human language-based communication cannot be realized without negation. All human languages have some form of negation. Despite this, negation remains a challenging phenomenon for current natural language understanding systems. To facilitate the future development of models that can process negation effectively, we present CONDAQA, the first English reading comprehension dataset which requires reasoning about the implications of negated statements in paragraphs. We collect paragraphs with diverse negation cues, then have crowdworkers ask questions about the implications of the negated statement in the passage. We also have workers make three kinds of edits to the passage---paraphrasing the negated statement, changing the scope of the negation, and reversing the negation---resulting in clusters of question-answer pairs that are difficult for models to answer with spurious shortcuts. CONDAQA features 14,182 question-answer pairs with over 200 unique negation cues and is challenging for current state-of-the-art models. We release our dataset, along with both supervised and few-shot evaluations, to facilitate the development of future NLP methods that work on negated language.", "track": "Theme Track", "label": 18}, {"loc": [10.605592727661133, 7.075927257537842], "id": 3075, "title": "Towards Opening the Black Box of Neural Machine Translation: Source and Target Interpretations of the Transformer", "authors": "Javier Ferrando, Gerard I. G\u00e1llego, Belen Alastruey, Carlos Escolano and Marta R. Costa-juss", "abstract": "In Neural Machine Translation (NMT), each token prediction is conditioned on the source sentence and the target prefix (what has been previously translated at a decoding step). However, previous work on interpretability in NMT has mainly focused solely on source sentence tokens' attributions. Therefore, we lack a full understanding of the influences of every input token (source sentence and target prefix) in the model predictions. In this work, we propose an interpretability method that tracks input tokens' attributions for both contexts. Our method, which can be extended to any encoder-decoder Transformer-based model, allows us to better comprehend the inner workings of current NMT models. We apply the proposed method to both bilingual and multilingual Transformers and present insights into their behaviour.", "track": "Machine Translation", "label": 10}, {"loc": [6.119662284851074, 12.33460521697998], "id": 3101, "title": "ArtELingo: A Million Emotion Annotations of WikiArt with Emphasis on Diversity over Language and Culture", "authors": "Youssef Sherif Mansour Mohamed, Mohamed Abdelfattah, Shyma Yaser Alhuwaider, Feifan Li, Xiangliang Zhang, Kenneth Ward Church and Mohamed Elhoseiny", "abstract": "This paper introduces ArtELingo, a new benchmark and dataset, designed to encourage work on diversity across languages and cultures. Following ArtEmis, a collection of 80k artworks from WikiArt with 0.45M emotion labels and English-only captions, ArtELingo adds another 0.79M annotations in Arabic and Chinese, plus 4.8K in Spanish to evaluate \"cultural-transfer\u201d performance. 51K artworks have 5 annotations or more in 3 languages. This diversity makes it possible to study similarities and differences across languages and cultures. Further, we investigate captioning tasks, and find diversity improves the performance of baseline models. ArtELingo is publicly available at `www.artelingo.org` with standard splits and baseline models. We hope our work will help ease future research on multilinguality and culturally-aware AI.", "track": "Ethic Concerns:Multilinguality", "label": 13}, {"loc": [1.844874382019043, 3.929999589920044], "id": 3119, "title": "Decoding a Neural Retriever's Latent Space for Query Suggestion", "authors": "Leonard Adolphs, Michelle Chen Huebscher, Christian Buck, Sertan Girgin, Olivier Bachem, Massimiliano Ciaramita and Thomas Hofmann", "abstract": "Neural retrieval models have superseded classic bag-of-words methods such as BM25 as the retrieval framework of choice. However, neural systems lack the interpretability of bag-of-words models; it is not trivial to connect a query change to a change in the latent space that ultimately determines the retrieval results. To shed light on this embedding space, we learn a \"query decoder\" that, given a latent representation of a neural search engine, generates the corresponding query. We show that it is possible to decode a meaningful query from its latent representation and, when moving in the right direction in latent space, to decode a query that retrieves the relevant paragraph. In particular, the query decoder can be useful to understand \"what should have been asked\" to retrieve a particular paragraph from the collection. We employ the query decoder to generate a large synthetic dataset of query reformulations for MSMarco, leading to improved retrieval performance. On this data, we train a pseudo-relevance feedback (PRF) T5 model for the application of query suggestion that outperforms both query reformulation and PRF information retrieval baselines.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.928468227386475, 9.183683395385742], "id": 3123, "title": "T-STAR: Truthful Style Transfer using AMR Graph as Intermediate Representation", "authors": "Anubhav Jangra, Preksha Nema and Aravindan Raghuveer", "abstract": "Unavailability of parallel corpora for training text style transfer (TST) models is a very challenging yet common scenario. Also, TST models implicitly need to preserve the content while transforming a source sentence into the target style. To tackle these problems, an intermediate representation is often constructed that is devoid of style while still preserving the meaning of the source sentence. In this work, we study the usefulness of Abstract Meaning Representation (AMR) graph as the intermediate style agnostic representation. We posit that semantic notations like AMR are a natural choice for an intermediate representation. Hence, we propose T-STAR: a model comprising of two components, text-to-AMR encoder and a AMR-to-text decoder. We propose several modeling improvements to enhance the style agnosticity of the generated AMR. To the best of our knowledge, T-STAR is the first work that uses AMR as an intermediate representation for TST. With thorough experimental evaluation we show T-STAR significantly outperforms state of the art techniques by achieving on an average 15.2% higher content preservation with negligible loss (~3%) in style accuracy. Through detailed human evaluation with 90,000 ratings, we also show that T-STAR has upto 50% lesser hallucinations compared to state of the art TST models.", "track": "Natural Language Generation", "label": 6}, {"loc": [8.046353340148926, 5.7361249923706055], "id": 3128, "title": "PromptBERT: Improving BERT Sentence Embeddings with Prompts", "authors": "Ting Jiang, Jian Jiao, Shaohan Huang, Zihan Zhang, deqing wang, Fuzhen Zhuang, Furu Wei, Haizhen Huang, Denvy Deng and Qi Zhang", "abstract": "We propose PromptBERT, a novel contrastive learning method for learning better sentence representation. We firstly analysis the drawback of current sentence embedding from original BERT and find that it is mainly due to the static token embedding bias and ineffective BERT layers. Then we propose the first prompt-based sentence embeddings method and discuss two prompt representing methods and three prompt searching methods to make BERT achieve better sentence embeddings .Moreover, we propose a novel unsupervised training objective by the technology of template denoising, which substantially shortens the performance gap between the supervised and unsupervised settings. Extensive experiments show the effectiveness of our method. Compared to SimCSE, PromptBert achieves 2.29 and 2.58 points of improvement based on BERT and RoBERTa in the unsupervised setting.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [5.469510078430176, 5.113385200500488], "id": 3132, "title": "Extending Logic Explained Networks to Text Classification", "authors": "Rishabh Jain, Gabriele Ciravegna, Pietro Barbiero, Francesco Giannini, Davide Buffelli and Pietro Lio", "abstract": "Recently, Logic Explained Networks (LENs) have been proposed as explainable-by-design neural models providing logic explanations for their predictions.\nHowever, these models have only been applied to vision and tabular data, and they mostly favour the generation of global explanations, while local ones tend to be noisy and verbose.\nFor these reasons, we propose LENp, improving local explanations by perturbing input words, and we test it on text classification. Our results show that (i) LENp provides better local explanations than LIME in terms of sensitivity and faithfulness, and (ii) its logic explanations are more useful and user-friendly than the feature scoring provided by LIME as attested by a human survey.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [1.673538088798523, 5.361381530761719], "id": 3137, "title": "Uni-Parser: Unified Semantic Parser for Question Answering on Knowledge Base and Database", "authors": "Ye Liu, Semih Yavuz, Rui Meng, Dragomir Radev, caiming xiong and Yingbo Zhou", "abstract": "Parsing natural language questions into executable logical forms is a useful and interpretable way to perform question answering on structured data such as knowledge bases (KB) or databases (DB). \nHowever, existing approaches on semantic parsing cannot adapt to both modalities, as they suffer from the exponential growth of the logical form candidates and can hardly generalize to unseen data.\nIn this work, we propose Uni-Parser, a unified semantic parser for question answering (QA) on both KB and DB. We define the primitive (relation and entity in KB, and table name, column name and cell value in DB) as the essential element in our framework. The number of primitives grows only at a linear rate to the number of retrieved relations in KB and DB, preventing us from exponential logic form candidates. \nWe leverage the generator to predict final logical forms by altering and composing top-ranked primitives with different operations (e.g. select, where, count). With sufficiently pruned search space by a contrastive primitive ranker, the generator is empowered to capture the composition of primitives enhancing its generalization ability. We achieve competitive results on multiple KB and DB QA benchmarks with more efficiency, especially in the compositional and zero-shot settings.", "track": "Question Answering", "label": 11}, {"loc": [9.369523048400879, 6.256190776824951], "id": 3138, "title": "RAPO: An Adaptive Ranking Paradigm for Bilingual Lexicon Induction", "authors": "Zhoujin Tian, Chaozhuo Li, Shuo Ren, Zhiqiang Zuo, Zengxuan Wen, Xinyue Hu, Xiao Han, Haizhen Huang, Denvy Deng, Qi Zhang and Xing Xie", "abstract": "Bilingual lexicon induction induces the word translations by aligning independently trained word embeddings in two languages. Existing approaches generally focus on minimizing the distances between words in the aligned pairs, while suffering from low discriminative capability to distinguish the relative orders between positive and negative candidates. In addition, the mapping function is globally shared by all words, whose performance might be hindered by the deviations in the distributions of different languages. In this work, we propose a novel ranking-oriented induction model RAPO to learn personalized mapping function for each word. RAPO is capable of enjoying the merits from the unique characteristics of a single word and the cross-language isomorphism simultaneously. Extensive experimental results on public datasets including both rich-resource and low-resource languages demonstrate the superiority of our proposal. Our code is publicly available in \\url{https://github.com/Jlfj345wf/RAPO}.", "track": "Machine Translation", "label": 10}, {"loc": [8.40419864654541, 6.5087056159973145], "id": 3146, "title": "On Parsing as Tagging", "authors": "Afra Amini and Ryan Cotterell", "abstract": "There are many proposals to reduce constituency parsing to tagging. To figure out what these approaches have in common, we offer a unifying pipeline, which consists of three steps: linearization, learning, and decoding. We prove that classic shift\u2013reduce parsing can be reduced to tetratagging---the state-of-the-art constituency tagger---under two assumptions: right-corner transformation in the linearization step and factored scoring in the learning step. We ask what is the most critical factor that makes parsing-as-tagging methods accurate while being efficient. To answer this question, we empirically evaluate a taxonomy of tagging pipelines with different choices of linearizers, learners, and decoders. Based on the results in English as well as a set of 8 typologically diverse languages, we conclude that the linearization of the derivation tree and its alignment with the input sequence is the most critical factor in achieving accurate parsers as taggers.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [5.4511332511901855, 12.183195114135742], "id": 3147, "title": "Distilled Dual-Encoder Model for Vision-Language Understanding", "authors": "Zekun Wang, Wenhui Wang, Haichao Zhu, ming liu, Bing Qin and Furu Wei", "abstract": "On vision-language understanding (VLU) tasks, fusion-encoder vision-language models achieve superior results but sacrifice efficiency because of the simultaneous encoding of images and text. On the contrary, the dual encoder model that separately encodes images and text has the advantage in efficiency, while failing on VLU tasks due to the lack of deep cross-modal interactions. To get the best of both worlds, we propose DiDE, a framework that distills the knowledge of the fusion-encoder teacher model into the dual-encoder student model. Since the cross-modal interaction is the key to the superior performance of teacher model but is absent in the student model, we encourage the student not only to mimic the predictions of teacher, but also to calculate the cross-modal attention distributions and align with the teacher. Experimental results demonstrate that DiDE is competitive with the fusion-encoder teacher model in performance (only a 1% drop) while enjoying 4 times faster inference. Further analyses reveal that the proposed cross-modal attention distillation is crucial to the success of our framework.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [1.3589158058166504, 10.333742141723633], "id": 3152, "title": "Argument Mining for Review Helpfulness Prediction", "authors": "Zaiqian Chen, Daniel Verdi do Amarante, Jenna Donaldson, Yohan Jo and Joonsuk Park", "abstract": "The importance of reliably determining the helpfulness of product reviews is rising as both helpful and unhelpful reviews continue to accumulate on e-commerce websites. And argumentational features---such as the structure of arguments and the types of underlying elementary units---have shown to be promising indicators of product review helpfulness. However, their adoption has been limited due to the lack of sufficient resources and large-scale experiments investigating their utility. To this end, we present the AMazon Argument Mining (AM$^2$) corpus---a corpus of 878 Amazon reviews on headphones annotated according to a theoretical argumentation model designed to evaluate argument quality.\nExperiments show that employing argumentational features leads to statistically significant improvements over the state-of-the-art review helpfulness predictors under both text-only and text-and-image settings.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [7.2147088050842285, 9.769562721252441], "id": 3172, "title": "Hierarchical Multi-Label Classification of Scientific Documents", "authors": "Mobashir Sadat and Cornelia Caragea", "abstract": "Automatic topic classification has been studied extensively to assist managing and indexing scientific documents in a digital collection. With the large number of topics being available in recent years, it has become necessary to arrange them in a hierarchy. Therefore, the automatic classification systems need to be able to classify the documents hierarchically. In addition, each paper is often assigned to more than one relevant topic. For example, a paper can be assigned to several topics in a hierarchy tree. In this paper, we introduce a new dataset for hierarchical multi-label text classification (HMLTC) of scientific papers called SciHTC, which contains 186,160 papers and 1,234 categories from the ACM CCS tree. We establish strong baselines for HMLTC and propose a multi-task learning approach for topic classification with keyword labeling as an auxiliary task. Our best model achieves a Macro-F1 score of 34.57% which shows that this dataset provides significant research opportunities on hierarchical scientific topic classification. We make our dataset and code for all experiments publicly available.", "track": "Resources and Evaluation", "label": 1}, {"loc": [4.765139579772949, 3.3717100620269775], "id": 3182, "title": "Rainier: Reinforced Knowledge Introspector for Commonsense Question Answering", "authors": "Jiacheng Liu, Skyler R. Hallinan, Ximing Lu, Pengfei He, Sean Welleck, Hannaneh Hajishirzi and Yejin Choi", "abstract": "Knowledge underpins reasoning. Recent research demonstrates that when relevant knowledge is provided as additional context to commonsense question answering (QA), it can substantially enhance the performance even on top of state-of-the-art. The fundamental challenge is where and how to find such knowledge that is high quality and on point with respect to the question; knowledge retrieved from knowledge bases are incomplete and knowledge generated from language models are inconsistent.\n\nWe present Rainier, or Reinforced Knowledge Introspector, that learns to generate contextually relevant knowledge in response to given questions. Our approach starts by imitating knowledge generated by GPT-3, then learns to generate its own knowledge via reinforcement learning where rewards are shaped based on the increased performance on the resulting question answering. Rainier demonstrates substantial and consistent performance gains when tested over 9 different commonsense benchmarks: including 5 datasets that are seen during model training, as well as 4 datasets that are kept unseen. Our work is the first to report that knowledge generated by models that are orders of magnitude smaller than GPT-3, even without direct supervision on the knowledge itself, can exceed the quality of commonsense knowledge elicited from GPT-3.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [8.202903747558594, 7.217472076416016], "id": 3187, "title": "A Major Obstacle for NLP Research: Let's Talk about Time Allocation!", "authors": "Katharina Kann, Shiran Dudy and Arya D. McCarthy", "abstract": "The field of natural language processing (NLP) has grown over the last few years: conferences have become larger, we have published an incredible amount of papers, and state-of-the-art research has been implemented in a large variety of customer-facing products. However, this paper argues that we have been less successful than we *should* have been and reflects on where and how the field fails to tap its full potential. \nSpecifically, we demonstrate that, in recent years, **subpar time allocation has been a major obstacle for NLP research**. We outline multiple concrete problems together with their negative consequences and, importantly, suggest remedies to improve the status quo. We hope that this paper will be a starting point for discussions around which common practices are -- or are *not* -- beneficial for NLP research.", "track": "Theme Track", "label": 18}, {"loc": [4.650156497955322, 9.133790969848633], "id": 3190, "title": "Towards Inter-character Relationship-driven Story Generation", "authors": "Anvesh Rao Vijjini, Faeze Brahman and Snigdha Chaturvedi", "abstract": "In this paper, we introduce the task of modeling interpersonal relationships for story generation. For addressing this task, we propose Relationships as Latent Variables for Story Generation, (ReLiSt). ReLiSt generates stories sentence by sentence and has two major components - a relationship selector and a story continuer. The relationship selector specifies a latent variable to pick the relationship to exhibit in the next sentence and the story continuer generates the next sentence while expressing the selected relationship in a coherent way. Our automatic and human evaluations demonstrate that ReLiSt is able to generate stories with relationships that are more faithful to desired relationships while maintaining the content quality. The relationship assignments to sentences during inference brings interpretability to ReLiSt.", "track": "Natural Language Generation", "label": 6}, {"loc": [1.8694096803665161, 3.9563591480255127], "id": 3201, "title": "Incorporating Relevance Feedback for Information-Seeking Retrieval using Few-Shot Document Re-Ranking", "authors": "Tim Baumg\u00e4rtner, Leonardo F. R. Ribeiro, Nils Reimers and Iryna Gurevych", "abstract": "Pairing a lexical retriever with a neural re-ranking model has set state-of-the-art performance on large-scale information retrieval datasets. This pipeline covers scenarios like question answering or navigational queries, however, for information-seeking scenarios, users often provide information on whether a document is relevant to their query in form of clicks or explicit feedback. Therefore, in this work, we explore how relevance feedback can be directly integrated into neural re-ranking models by adopting few-shot and parameter-efficient learning techniques. Specifically, we introduce a kNN approach that re-ranks documents based on their similarity with the query and the documents the user considers relevant. Further, we explore Cross-Encoder models that we pre-train using meta-learning and subsequently fine-tune for each query, training only on the feedback documents. To evaluate our different integration strategies, we transform four existing information retrieval datasets into the relevance feedback scenario. Extensive experiments demonstrate that integrating relevance feedback directly in neural re-ranking models improves their performance, and fusing lexical ranking with our best performing neural re-ranker outperforms all other methods by 5.2% nDCG@20.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [4.315304279327393, 4.844986915588379], "id": 3202, "title": "ReasTAP: Injecting Table Reasoning Skills During Pre-training via Synthetic Reasoning Examples", "authors": "Yilun Zhao, Linyong Nan, Zhenting Qi, Rui Zhang and Dragomir Radev", "abstract": "Reasoning over tabular data requires both table structure understanding and a broad set of table reasoning skills. Current models with table-specific architectures and pre-training methods perform well on understanding table structures, but they still struggle with tasks that require various table reasoning skills. In this work, we develop ReasTAP to show that high-level table reasoning skills can be injected into models during pre-training without a complex table-specific architecture design. We define 7 table reasoning skills, such as numerical operation, temporal comparison, and conjunction. Each reasoning skill is associated with one example generator, which synthesizes questions over semi-structured tables according to the sampled templates. We model the table pre-training task as a sequence generation task and pre-train ReasTAP to generate precise answers of the synthetic examples. ReasTAP is evaluated on four benchmarks covering three downstream tasks including 1) WikiSQL-Weak and WikiTQ for Table Question Answering, 2) TabFact for Table Fact Verification, and 3) LogicNLG for Faithful Table-to-Text Generation. Experimental results demonstrate that ReasTAP achieves new state-of-the-art results on all of them and delivers a significant improvement under low-resource setting. Our code is publicly available at https://github.com/Yale-LILY/ReasTAP.", "track": "Question Answering", "label": 11}, {"loc": [9.20965576171875, 7.070419788360596], "id": 3205, "title": "Few-shot Learning with Multilingual Generative Language Models", "authors": "Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov and Xian Li", "abstract": "Large-scale generative language models such as GPT-3 are competitive few-shot learners. While these models are known to be able to jointly represent many different languages, their training data is dominated by English, potentially limiting their cross-lingual generalization. In this work, we train multilingual generative language models on a corpus covering a diverse set of languages, and study their few- and zero-shot learning capabilities in a wide range of tasks. Our largest model with 7.5 billion parameters sets new state of the art in few-shot learning in more than 20 representative languages, outperforming GPT-3 of comparable size in multilingual commonsense reasoning (with +7.4% absolute accuracy improvement in 0-shot settings and +9.4% in 4-shot settings) and natural language inference (+5.4% in each of 0-shot and 4-shot settings). On the FLORES-101 machine translation benchmark, our model outperforms GPT-3 on 171 out of 182 directions with 32 training examples, while surpassing the official supervised baseline in 45 directions. We conduct an in-depth analysis of different multilingual prompting approaches, showing in particular that strong few-shot learning performance across languages can be achieved via cross-lingual transfer through both templates and demonstration examples.", "track": "Multilinguality", "label": 13}, {"loc": [6.868297576904297, 6.257011890411377], "id": 3219, "title": "Are representations built from the ground up? An empirical examination of local composition in language models", "authors": "Emmy Liu and Graham Neubig", "abstract": "Compositionality, the phenomenon where the meaning of a phrase can be derived from its constituent parts, is a hallmark of human language. At the same time, many phrases are non-compositional, carrying a meaning beyond that of each part in isolation. Representing both of these types of phrases is critical for language understanding, but it is an open question whether modern language models (LMs) learn to do so; in this work we examine this question. We first formulate a problem of predicting the LM-internal representations of longer phrases given those of their constituents. We find that the representation of a parent phrase can be predicted with some accuracy given an affine transformation of its children. While we would expect the predictive accuracy to correlate with human judgments of semantic compositionality, we find this is largely not the case, indicating that LMs may not accurately distinguish between compositional and non-compositional phrases. We perform a variety of analyses, shedding light on when different varieties of LMs do and do not generate compositional representations, and discuss implications for future modeling work.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [7.198393821716309, 9.317082405090332], "id": 3224, "title": "Detecting Label Errors by Using Pre-Trained Language Models", "authors": "Derek Chong, Jenny Hong and Christopher D. Manning", "abstract": "We show that large pre-trained language models are inherently highly capable of identifying label errors in natural language datasets: simply examining out-of-sample data points in descending order of fine-tuned task loss significantly outperforms more complex error-detection mechanisms proposed in previous work. \nTo this end, we contribute a novel method for introducing realistic, human-originated label noise into existing crowdsourced datasets such as SNLI and TweetNLP. \nWe show that this noise has similar properties to real, hand-verified label errors, and is harder to detect than existing synthetic noise, creating challenges for model robustness.\nWe argue that human-originated noise is a better standard for evaluation than synthetic noise. Finally, we use crowdsourced verification to evaluate the detection of real errors on IMDB, Amazon Reviews, and Recon, and confirm that pre-trained models perform at a 9--36% higher absolute Area Under the Precision-Recall Curve than existing models.", "track": "Resources and Evaluation", "label": 1}, {"loc": [8.996356964111328, 8.176201820373535], "id": 3236, "title": "Intriguing Properties of Compression on Multilingual Models", "authors": "Kelechi Ogueji, Orevaoghene Ahia, Gbemileke A. Onilude, Sebastian Gehrmann, Sara Hooker and Julia Kreutzer", "abstract": "Multilingual models are often particularly dependent on scaling to generalize to a growing number of languages. Compression techniques are widely relied upon to reconcile the growth in model size with real world resource constraints, but compression can have a disparate effect on model performance for low-resource languages. It is thus crucial to understand the trade-offs between scale, multilingualism, and compression. In this work, we propose an experimental framework to characterize the impact of sparsifying multilingual pre-trained language models during fine-tuning.\nApplying this framework to mBERT named entity recognition models across 40 languages, we find that compression confers several intriguing and previously unknown generalization properties. In contrast to prior findings, we find that compression may improve model robustness over dense models. We additionally observe that under certain sparsification regimes compression may aid, rather than disproportionately impact the performance of low-resource languages.", "track": "Multilinguality", "label": 13}, {"loc": [4.13213586807251, 9.340680122375488], "id": 3242, "title": "Sequence Models for Document Structure Identification in an Undeciphered Script", "authors": "Logan Born, M. Monroe, Kathryn Kelley and Anoop Sarkar", "abstract": "This work describes the first thorough analysis of \"header\u201d signs in proto-Elamite, an undeciphered script from 3100-2900 BCE. Headers are a category of signs which have been provisionally identified through painstaking manual analysis of this script by domain experts. We use unsupervised neural and statistical sequence modeling techniques to provide new and independent evidence for the existence of headers, without supervision from domain experts. Having affirmed the existence of headers as a legitimate structural feature, we next arrive at a richer understanding of their possible meaning and purpose by (i) examining which features predict their presence; (ii) identifying correlations between these features and other document properties; and (iii) examining cases where these features predict the presence of a header in texts where domain experts do not expect one (or vice versa). We provide more concrete processes for labeling headers in this corpus and a clearer justification for existing intuitions about document structure in proto-Elamite.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [8.656522750854492, 5.997016906738281], "id": 3254, "title": "English Contrastive Learning Can Learn Universal Cross-lingual Sentence Embeddings", "authors": "Yaushian Wang, Ashley Wu and Graham Neubig", "abstract": "Universal cross-lingual sentence embeddings map semantically similar cross-lingual sentences into a shared embedding space. Aligning cross-lingual sentence embeddings usually requires supervised cross-lingual parallel sentences. In this work, we propose mSimCSE, which extends SimCSE to multilingual settings and reveal that contrastive learning on English data can surprisingly learn high-quality universal cross-lingual sentence embeddings without any parallel data.\nIn unsupervised and weakly supervised settings, mSimCSE significantly improves previous sentence embedding methods on cross-lingual retrieval and multilingual STS tasks. The performance of unsupervised mSimCSE is comparable to fully supervised methods in retrieving low-resource languages and multilingual STS.\nThe performance can be further enhanced when cross-lingual NLI data is available.", "track": "Multilinguality", "label": 13}, {"loc": [7.594743728637695, 9.035468101501465], "id": 3260, "title": "Active Example Selection for In-Context Learning", "authors": "Yiming Zhang, Shi Feng and Chenhao Tan", "abstract": "With a handful of demonstration examples, large-scale language models demonstrate strong capability to perform various tasks by in-context learning from these examples, without any fine-tuning. We demonstrate that in-context learning performance can be highly unstable across samples of examples, indicating the idiosyncrasies of how language models acquire information. We formulate example selection for in-context learning as a sequential decision problem, and propose a reinforcement learning algorithm for identifying generalizable policies to select demonstration examples. For GPT-2, our learned policies demonstrate strong abilities of generalizing to unseen tasks in training, with a 5.8% improvement on average. Examples selected from our learned policies can even achieve a small improvement on GPT-3 Ada. However, the improvement diminishes on larger GPT-3 models, suggesting emerging capabilities of large language models.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [3.7692370414733887, 9.596522331237793], "id": 3262, "title": "Improving Factual Consistency in Summarization with Compression-Based Post-Editing", "authors": "Alex Fabbri, Prafulla Kumar Choubey, Jesse Vig, Chien-Sheng Wu and caiming xiong", "abstract": "State-of-the-art summarization models still struggle to be factually consistent with the input text. A model-agnostic way to address this problem is post-editing the generated summaries. However, existing approaches typically fail to remove entity errors if a suitable input entity replacement is not available or may insert erroneous content. In our work, we focus on removing extrinsic entity errors, or entities not in the source, to improve consistency while retaining the summary's essential information and form. We propose to use sentence-compression data to train the post-editing model to take a summary with extrinsic entity errors marked with special tokens and output a compressed, well-formed summary with those errors removed. We show that this model improves factual consistency while maintaining ROUGE, improving entity precision by up to 30% on XSum, and that this model can be applied on top of another post-editor, improving entity precision by up to a total of 38%. We perform an extensive comparison of post-editing approaches that demonstrate trade-offs between factual consistency, informativeness, and grammaticality, and we analyze settings where post-editors show the largest improvements.", "track": "Summarization", "label": 14}, {"loc": [7.270824909210205, 6.932460784912109], "id": 3272, "title": "Evaluating the Impact of Model Scale for Compositional Generalization in Semantic Parsing", "authors": "Linlu Qiu, Peter Shaw, Panupong Pasupat, Tianze Shi, Jonathan Herzig, Emily Pitler, Fei Sha and Kristina Toutanova", "abstract": "Despite their strong performance on many tasks, pre-trained language models have been shown to struggle on out-of-distribution compositional generalization. Meanwhile, recent work has shown considerable improvements on many NLP tasks from model scaling. Can scaling up model size also improve compositional generalization in semantic parsing? We evaluate encoder-decoder models up to 11B parameters and decoder-only models up to 540B parameters, and compare model scaling curves for three different methods for applying a pre-trained language model to a new task: fine-tuning all parameters, prompt tuning, and in-context learning. We observe that fine-tuning generally has flat or negative scaling curves on out-of-distribution compositional generalization in semantic parsing evaluations. In-context learning has positive scaling curves, but is generally outperformed by much smaller fine-tuned models. Prompt-tuning can outperform fine-tuning, suggesting further potential improvements from scaling as it exhibits a more positive scaling curve. Additionally, we identify several error trends that vary with model scale. For example, larger models are generally better at modeling the syntax of the output space, but are also more prone to certain types of overfitting. Overall, our study highlights limitations of current techniques for effectively leveraging model scale for compositional generalization, while our analysis also suggests promising directions for future work.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [6.146089553833008, 5.792469024658203], "id": 3277, "title": "\"I'm sorry to hear that\": Finding New Biases in Language Models with a Holistic Descriptor Dataset", "authors": "Eric Michael Smith, Melissa Hall, Melanie Kambadur, Eleonora Presani and Adina Williams", "abstract": "As language models grow in popularity, it becomes increasingly important to clearly measure all possible markers of demographic identity in order to avoid perpetuating existing societal harms. Many datasets for measuring bias currently exist, but they are restricted in their coverage of demographic axes and are commonly used with preset bias tests that presuppose which types of biases models can exhibit. In this work, we present a new, more inclusive bias measurement dataset, HolisticBias, which includes nearly 600 descriptor terms across 13 different demographic axes. HolisticBias was assembled in a participatory process including experts and community members with lived experience of these terms. These descriptors combine with a set of bias measurement templates to produce over 450,000 unique sentence prompts, which we use to explore, identify, and reduce novel forms of bias in several generative models. We demonstrate that HolisticBias is effective at measuring previously undetectable biases in token likelihoods from language models, as well as in an offensiveness classifier. We will invite additions and amendments to the dataset, which we hope will serve as a basis for more easy-to-use and standardized methods for evaluating bias in NLP models.", "track": "Ethic Concerns:Ethics", "label": 21}, {"loc": [5.240553379058838, 12.152050018310547], "id": 3286, "title": "Understanding ME? Multimodal Evaluation for Fine-grained Visual Commonsense", "authors": "Zhecan Wang, Haoxuan You, Yicheng He, Wenhao Li, Kai-Wei Chang and Shih-Fu Chang", "abstract": "Visual commonsense understanding requires Vision Language (VL) models to not only understand image and text but also cross-reference in-between to fully integrate and achieve comprehension of the visual scene described. Recently, various approaches have been developed and have achieved high performance on visual commonsense benchmarks. However, it is unclear whether the models really understand the visual scene and underlying commonsense knowledge due to limited evaluation data resources. To provide an in-depth analysis, we present a Multimodal Evaluation (ME) pipeline to automatically generate question-answer pairs to test models' understanding of the visual scene, text, and related knowledge. We then take a step further to show that training with the ME data boosts the model's performance in standard VCR evaluation. Lastly, our in-depth analysis and comparison reveal interesting findings: (1) semantically low-level information can assist the learning of high-level information but not the opposite; (2) visual information is generally under utilization compared with text.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [6.6946797370910645, 9.844232559204102], "id": 3298, "title": "Semantic Novelty Detection and Characterization in Factual Text Involving Named Entities", "authors": "Nianzu Ma, Sahisnu Mazumder, Alexander Politowicz, Bing Liu, Eric Robertson and Scott Grigsby", "abstract": "Much of the existing work on text novelty detection has been studied at the topic level, i.e., identifying whether the topic of a document or a sentence is novel or not. Little work has been done at the fine-grained semantic level (or contextual level). For example, given that we know Elon Musk is the CEO of a technology company, the sentence \"Elon Musk acted in the sitcom The Big Bang Theory'' is novel and surprising because normally a CEO would not be an actor. Existing topic-based novelty detection methods work poorly on this problem because they do not perform semantic reasoning involving relations between named entities in the text and their background knowledge. This paper proposes an effective model (called PAT-SND) to solve the problem, which can also characterize the novelty. An annotated dataset is also created. Evaluation shows that PAT-SND outperforms 10 baselines by large margins.", "track": "NLP Applications", "label": 0}, {"loc": [4.888962268829346, 3.3746895790100098], "id": 3304, "title": "CN-AutoMIC: Distilling Chinese Commonsense Knowledge from Pretrained Language Models", "authors": "Chenhao Wang, Jiachun Li, Yubo Chen, Kang Liu and Jun Zhao", "abstract": "Commonsense knowledge graphs (CKGs) are increasingly applied in various natural language processing tasks. However, most existing CKGs are limited to English, which hinders related research in non-English languages. Meanwhile, directly generating commonsense knowledge from pretrained language models has recently received attention, yet it has not been explored in non-English languages. In this paper, we propose a large-scale Chinese CKG generated from multilingual PLMs, named as **CN-AutoMIC**, aiming to fill the research gap of non-English CKGs. To improve the efficiency, we propose generate-by-category strategy to reduce invalid generation. To ensure the filtering quality, we develop cascaded filters to discard low-quality results. To further increase the diversity and density, we introduce a bootstrapping iteration process to reuse generated results. Finally, we conduct detailed analyses on CN-AutoMIC from different aspects. Empirical results show the proposed CKG has high quality and diversity, surpassing the direct translation version of similar English CKGs. We also find some interesting deficiency patterns and differences between relations, which reveal pending problems in commonsense knowledge generation. We share the resources and related models for further study.", "track": "Resources and Evaluation", "label": 1}, {"loc": [7.648690700531006, 7.889715194702148], "id": 3306, "title": "Calibrating Student Models for Emotion-related Tasks", "authors": "Mahshid Hosseini and Cornelia Caragea", "abstract": "Knowledge Distillation (KD) is an effective method to transfer knowledge from one network (a.k.a. teacher) to another (a.k.a. student). In this paper, we study KD on the emotion-related tasks from a new perspective: calibration. We further explore the impact of the mixup data augmentation technique on the distillation objective and propose to use a simple yet effective mixup method informed by training dynamics for calibrating the student models. Underpinned by the regularization impact of the mixup process by providing better training signals to the student models using training dynamics, our proposed mixup strategy gradually enhances the student model's calibration while effectively improving its performance. We evaluate the calibration of pre-trained language models through knowledge distillation over three tasks of emotion detection, sentiment analysis, and empathy detection. By conducting extensive experiments on different datasets, with both in-domain and out-of-domain test sets, we demonstrate that student models distilled from teacher models trained using our proposed mixup method obtained the lowest Expected Calibration Errors (ECEs) and best performance on both in-domain and out-of-domain test sets.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [9.256805419921875, 7.054654598236084], "id": 3313, "title": "Overcoming Catastrophic Forgetting in Zero-Shot Cross-Lingual Generation", "authors": "Tu Vu, Aditya Barua, Brian Lester, Daniel Cer, Mohit Iyyer and Noah Constant", "abstract": "In this paper, we explore the challenging problem of performing a generative task in a target language when labeled data is only available in English, using summarization as a case study. We assume a strict setting with no access to parallel data or machine translation and find that common transfer learning approaches struggle in this setting, as a generative multilingual model fine-tuned purely on English catastrophically forgets how to generate non-English. Given the recent rise of parameter-efficient adaptation techniques, we conduct the first investigation into how one such method, prompt tuning (Lester et al., 2021), can overcome catastrophic forgetting to enable zero-shot cross-lingual generation. Our experiments show that parameter-efficient prompt tuning provides gains over standard fine-tuning when transferring between less-related languages, e.g., from English to Thai. However, a significant gap still remains between these methods and fully-supervised baselines. To improve cross-lingual transfer further, we explore several approaches, including: (1) mixing in unlabeled multilingual data, and (2) explicitly factoring prompts into recombinable language and task components. Our approaches can provide further quality gains, suggesting that robust zero-shot cross-lingual generation is within reach.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [8.096649169921875, 5.16565465927124], "id": 3320, "title": "Improving Large-scale Paraphrase Acquisition and Generation", "authors": "Yao Dou, Chao Jiang and Wei Xu", "abstract": "This paper addresses the quality issues in existing Twitter-based paraphrase datasets, and discusses the necessity of using two separate definitions of paraphrase for identification and generation tasks. We present a new Multi-Topic Paraphrase in Twitter (MultiPIT) corpus that consists of a total of 130k sentence pairs with crowdsoursing (MultiPIT_crowd) and expert (MultiPIT_expert) annotations using two different paraphrase definitions for paraphrase identification, in addition to a multi-reference test set (MultiPIT_NMR) and a large automatically constructed training set (MultiPIT_Auto) for paraphrase generation. With improved data annotation quality and task-specific paraphrase definition, the best pre-trained language model fine-tuned on our dataset achieves the state-of-the-art performance of 84.2 F1 for automatic paraphrase identification. Furthermore, our empirical results also demonstrate that the paraphrase generation models trained on MultiPIT_Auto generate more diverse and high-quality paraphrases compared to their counterparts fine-tuned on other corpora such as Quora, MSCOCO, and ParaNMT.", "track": "Resources and Evaluation", "label": 1}, {"loc": [8.023735046386719, 9.056941032409668], "id": 3322, "title": "Entropy- and Distance-Based Predictors From GPT-2 Attention Patterns Predict Reading Times Over and Above GPT-2 Surprisal", "authors": "Byung-Doh Oh and William Schuler", "abstract": "Transformer-based large language models are trained to make predictions about the next word by aggregating representations of previous tokens through their self-attention mechanism. In the field of cognitive modeling, such attention patterns have recently been interpreted as embodying the process of cue-based retrieval, in which attention over multiple targets is taken to generate interference and latency during retrieval. Under this framework, this work first defines an entropy-based predictor that quantifies the diffuseness of self-attention, as well as distance-based predictors that capture the incremental change in attention patterns across timesteps. Moreover, following recent studies that question the informativeness of attention weights, we also experiment with alternative methods for incorporating vector norms into attention weights. Regression experiments using predictors calculated from the GPT-2 language model show that these predictors deliver a substantially better fit to held-out self-paced reading and eye-tracking data over a rigorous baseline including GPT-2 surprisal.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [4.876716136932373, 5.931199550628662], "id": 3340, "title": "A Survey of Computational Framing Analysis Approaches", "authors": "Mohammad Ali and Naeemul Hassan", "abstract": "Framing analysis is predominantly qualitative and quantitative, examining a small dataset with manual coding. Easy access to digital data in the last two decades prompts scholars in both computation and social sciences to utilize various computational tools to explore frames in large-scale datasets. The growing scholarship, however, lacks a comprehensive understanding and resources of computational framing analysis methods. Aiming to address the gap, this article surveys existing computational framing analysis approaches and puts them together. The research is expected to help scholars and journalists gain a deeper understanding of how frames are being explored computationally, better equip them to analyze frames in large-scale datasets, and, finally, work on advancing the methodological approaches.", "track": "Resources and Evaluation", "label": 1}, {"loc": [1.3934751749038696, 8.112939834594727], "id": 3344, "title": "Learning Cross-Task Dependencies for Joint Extraction of Entities, Events, Event Arguments, and Relations", "authors": "Minh Van Nguyen, Bonan Min, Franck Dernoncourt and Thien Huu Nguyen", "abstract": "Extracting entities, events, event arguments, and relations (i.e., task instances) from text represents four main challenging tasks in information extraction (IE), which have been solved jointly (JointIE) to boost the overall performance for IE. As such, previous work often leverages two types of dependencies between the tasks, i.e., cross-instance and cross-type dependencies representing relatedness between task instances and correlations between information types of the tasks. However, the cross-task dependencies in prior work are not optimal as they are only designed manually according to some task heuristics. To address this issue, we propose a novel model for JointIE that aims to learn cross-task dependencies from data. In particular, we treat each task instance as a node in a dependency graph where edges between the instances are inferred through information from different layers of a pretrained language model (e.g., BERT). Furthermore, we utilize the Chow-Liu algorithm to learn a dependency tree between information types for JointIE by seeking to approximate the joint distribution of the types from data. Finally, the Chow-Liu dependency tree is used to generate cross-type patterns, serving as anchor knowledge to guide the learning of representations and dependencies between instances for JointIE. Experimental results show that our proposed model significantly outperforms strong JointIE baselines over four datasets with different languages.", "track": "Information Extraction", "label": 5}, {"loc": [4.244668483734131, 7.392948627471924], "id": 3352, "title": "Don't Copy the Teacher: Data and Model Challenges in Embodied Dialogue", "authors": "So Yeon Min, Hao Zhu, Ruslan Salakhutdinov and Yonatan Bisk", "abstract": "Embodied dialogue instruction following requires an agent to complete a complex sequence of tasks from a natural language exchange. The recent introduction of benchmarks raises the question of how best to train and evaluate models for this multi-turn, multi-agent, long-horizon task. This paper contributes to that conversation, by arguing that imitation learning (IL) and related low-level metrics are actually misleading and do not align with the goals of embodied dialogue research and may hinder progress.\n\nWe provide empirical comparisons of metrics, analysis of three models, and make suggestions for how the field might best progress. First, we observe that models trained with IL take spurious actions during evaluation. Second, we find that existing models fail to ground query utterances, which are essential for task completion. Third, we argue evaluation should focus on higher-level semantic goals. We will release code to additionally filter the data and benchmark models for improved evaluation.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.6887712478637695, 10.785508155822754], "id": 3353, "title": "ALFRED-L: Investigating the Role of Language for Action Learning in Interactive Visual Environments", "authors": "Arjun Akula, Spandana Gella, Aishwarya Padmakumar, Mahdi Namazifar, Mohit Bansal, Jesse Thomason and Dilek Hakkani-Tur", "abstract": "Embodied Vision and Language Task Completion requires an embodied agent to interpret natural language instructions and egocentric visual observations to navigate through and interact with environments. In this work, we examine ALFRED, a challenging benchmark for embodied task completion, with the goal of gaining insight into how effectively models utilize language. We find evidence that sequence-to-sequence and transformer-based models trained on this benchmark are not sufficiently sensitive to changes in input language instructions. Next, we construct a new test split -- ALFRED-L to test whether ALFRED models can generalize to task structures not seen during training that intuitively require the same types of language understanding required in ALFRED. Evaluation of existing models on ALFRED-L suggests that (a) models are overly reliant on the sequence in which objects are visited in typical ALFRED trajectories and fail to adapt to modifications of this sequence and (b) models trained with additional augmented trajectories are able to adapt relatively better to such changes in input language instructions.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.124453544616699, 7.540177822113037], "id": 3357, "title": "Dungeons and Dragons as a Dialog Challenge for Artificial Intelligence", "authors": "Chris Callison-Burch, Gaurav Singh Tomar, Lara Martin, Daphne Ippolito, Suma Bailis and David Reitter", "abstract": "AI researchers have posited Dungeons and Dragons (D&D) as a challenge problem to test systems on various language-related capabilities. In this paper, we frame D&D specifically as a dialogue system challenge, where the tasks are to both generate the next conversational turn in the game and predict the state of the game given the dialogue history. We create a gameplay dataset consisting of nearly 900 games, with a total of 7,000 players, 800,000 dialogue turns, 500,000 dice rolls, and 58 million words. We automatically annotate the data with partial state information about the game play. We train a large language model (LM) to generate the next game turn, conditioning it on different information. The LM can respond as a particular character or as the player who runs the game\u2014i.e., the Dungeon Master (DM). It is trained to produce dialogue that is either in-character (roleplaying in the fictional world) or out-of-character (discussing rules or strategy). We perform a human evaluation to determine what factors make the generated output plausible and interesting. We further perform an automatic evaluation to determine how well the model can predict the game state given the history and examine how well tracking the game state improves its ability to produce plausible conversational output.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [1.5910078287124634, 8.584247589111328], "id": 3371, "title": "Unsupervised Entity Linking with Guided Summarization and Multiple-Choice Selection", "authors": "Young Min Cho, Li Zhang and Chris Callison-Burch", "abstract": "Entity linking, the task of linking potentially ambiguous mentions in texts to corresponding knowledge-base entities, is an important component for language understanding. We address two challenge in entity linking: how to leverage wider contexts surrounding a mention, and how to deal with limited training data. We propose a fully unsupervised model called SumMC that first generates a guided summary of the contexts conditioning on the mention, and then casts the task to a multiple-choice problem where the model chooses an entity from a list of candidates. In addition to evaluating our model on existing datasets that focus on named entities, we create a new dataset that links noun phrases from WikiHow to Wikidata. We show that our SumMC model achieves state-of-the-art unsupervised performance on our new dataset and on exiting datasets.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [5.575639724731445, 11.72941780090332], "id": 3376, "title": "Weakly-Supervised Temporal Article Grounding", "authors": "Long Chen, Yulei Niu, Brian Chen, Xudong Lin, Guangxing Han, Christopher Thomas, Hammad Ayyubi, Heng Ji and Shih-Fu Chang", "abstract": "Given a long untrimmed video and natural language queries, video grounding (VG) aims to temporally localize the semantically-aligned video segments. Almost all existing VG work holds two simple but unrealistic assumptions: 1) All query sentences can be grounded in the corresponding video. 2) All query sentences for the same video are always at the same semantic scale. Unfortunately, both assumptions make today's VG models fail to work in practice. For example, in real-world multimodal assets (eg, news articles), most of the sentences in the article can not be grounded in their affiliated videos, and they typically have rich hierarchical relations (ie, at different semantic scales). To this end, we propose a new challenging grounding task: Weakly-Supervised temporal Article Grounding (WSAG). Specifically, given an article and a relevant video, WSAG aims to localize all ``groundable'' sentences to the video, and these sentences are possibly at different semantic scales. Accordingly, we collect the first WSAG dataset to facilitate this task: YouwikiHow, which borrows the inherent multi-scale descriptions in wikiHow articles and plentiful YouTube videos. In addition, we propose a simple but effective method DualMIL for WSAG, which consists of a two-level MIL loss and a single-/cross- sentence constraint loss. These training objectives are carefully designed for these relaxed assumptions. Extensive ablations have verified the effectiveness of DualMIL.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [2.531998634338379, 4.610751628875732], "id": 3378, "title": "Exploring Dual Encoder Architectures for Question Answering", "authors": "Zhe Dong, Jianmo Ni, Dan Bikel, Enrique Alfonseca, Yuan Wang, Chen Qu and Imed Zitouni", "abstract": "Dual encoders have been used for question-answering (QA) and information retrieval (IR) tasks with good results. There are two major types of dual encoders, Siamese Dual Encoders (SDE), with parameters shared across two encoders, and Asymmetric Dual Encoder (ADE), with two distinctly parameterized encoders. In this work, we explore the dual encoder architectures for QA retrieval tasks. By evaluating on MS MARCO, open domain NQ, and the MultiReQA benchmarks, we show that SDE performs significantly better than ADE. We further propose three different improved versions of ADEs. Based on the evaluation of QA retrieval tasks and direct analysis of the embeddings, we demonstrate that sharing parameters in projection layers would enable ADEs to perform competitively with SDEs.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [6.19614315032959, 7.879395484924316], "id": 3385, "title": "arXivEdits: Understanding the Human Revision Process in Scientific Writing", "authors": "Chao Jiang, Wei Xu and Samuel Stevens", "abstract": "Scientific publications are the primary means to communicate research discoveries, where the writing quality is of crucial importance. However, prior work studying the human editing process in this domain mainly focused on the abstract or introduction sections, resulting in an incomplete picture. In this work, we provide a complete computational framework for studying text revision for scientific writing. We first introduce arXivEdits, a new annotated corpus of 751 full papers from arXiv with gold sentence alignment across their multiple versions of revision, as well as fine-grained span-level edits and their underlying intentions for 1,000 sentence pairs. It supports our data-driven analysis to unveil the common strategies practiced by researchers for revising their papers. To scale up the analysis, we also develop automatic methods to extract revision at document-, sentence-, and word-levels. A neural CRF sentence alignment model trained on our corpus achieves 93.8 F1, enabling the reliable matching of sentences between different versions. We formulate the edit extraction task as a span alignment problem, and our proposed method extracts more fine-grained and explainable edits, compared to the commonly used diff algorithm. An intention classifier trained on our dataset achieves 78.9 F1 on the fine-grained intent classification task. Our data and systems are released at tiny.one/arxivedits.", "track": "Resources and Evaluation", "label": 1}, {"loc": [3.9896295070648193, 8.993953704833984], "id": 3390, "title": "Why Do You Feel This Way? Summarizing Triggers of Emotions in Social Media Posts", "authors": "Hongli Zhan, Tiberiu Sosea, Cornelia Caragea and Junyi Jessy Li", "abstract": "Crises such as the COVID-19 pandemic continuously threaten our world and emotionally affect billions of people worldwide in distinct ways. Understanding the triggers leading to people's emotions is of crucial importance. Social media posts can be a good source of such analysis, yet these texts tend to be charged with multiple emotions, with triggers scattering across multiple sentences. This paper takes a novel angle, namely, emotion detection and trigger summarization, aiming to both detect perceived emotions in text, and summarize events and their appraisals that trigger each emotion. To support this goal, we introduce CovidET (Emotions and their Triggers during Covid-19), a dataset of ~1,900 English Reddit posts related to COVID-19, which contains manual annotations of perceived emotions and abstractive summaries of their triggers described in the post. We develop strong baselines to jointly detect emotions and summarize emotion triggers. Our analyses show that CovidET presents new challenges in emotion-specific summarization, as well as multi-emotion detection in long social media posts.", "track": "Summarization", "label": 14}, {"loc": [4.0564093589782715, 3.9607183933258057], "id": 3393, "title": "Analogical Math Word Problems Solving with Enhanced Problem-Solution Association", "authors": "Zhenwen Liang, Jipeng Zhang and Xiangliang Zhang", "abstract": "Math word problem (MWP) solving is an important task in question answering which requires human-like reasoning ability. Analogical reasoning has long been used in mathematical education, as it enables students to apply common relational structures of mathematical situations to solve new problems. In this paper, we propose to build a novel MWP solver by leveraging analogical MWPs, which advance the solver's generalization ability across different kinds of MWPs. The key idea, named analogy identification, is to associate the analogical MWP pairs in a latent space, i.e., encoding an MWP close to another analogical MWP, while leaving away from the non-analogical ones. Moreover, a solution discriminator is integrated into the MWP solver to enhance the association between an MWP and its true solution. The evaluation results verify that our proposed analogical learning strategy promotes the performance of MWP-BERT on Math23k over the state-of-the-art model Generate2Rank, with 5 times fewer parameters in the encoder. We also find that our model has a stronger generalization ability in solving difficult MWPs due to the analogical learning from easy MWPs.", "track": "Question Answering", "label": 11}, {"loc": [3.8607699871063232, 4.542219161987305], "id": 3399, "title": "Towards Teachable Reasoning Systems: Using a Dynamic Memory of User Feedback for Continual System Improvement", "authors": "Bhavana Dalvi Mishra, Oyvind Tafjord and Peter Clark", "abstract": "Our goal is a teachable reasoning system for question-answering (QA), where a user can interact with faithful answer explanations, and correct its errors so that the system improves over time. Our approach is to augment a QA model with a dynamic memory of user feedback, containing user-supplied corrections to\nerroneous model beliefs that users identify during interaction. Retrievals from memory are used as additional context for QA, to help avoid previous mistakes in similar new situations - a novel application of memory-based continuous learning. With simulated feedback, we find that our system (called TeachMe) continually improves with time, and without model retraining, requiring feedback on only 25% of training examples to reach within 1% of the upper-bound (feedback on all examples). Similarly, in experiments with real users, we observe a similar trend, with performance improving by over 15% on a hidden test set after teaching. This suggests new opportunities for using frozen language models in an interactive setting where users can inspect, debug, and correct the model's beliefs, leading to improved system's performance over time.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [2.9380671977996826, 4.642478942871094], "id": 3436, "title": "Knowledge Transfer from Answer Ranking to Answer Generation", "authors": "Matteo Gabburo, Rik Koncel-Kedziorski, Siddhant Garg, Luca Soldaini and Alessandro Moschitti", "abstract": "Recent studies show that Question Answering (QA) based on Answer Sentence Selection (AS2) can be improved by generating an improved answer from the top-k ranked answer sentences (termed GenQA). This allows for synthesizing the information from multiple candidates into a concise, natural-sounding answer. However, creating large-scale supervised training data for GenQA models is very challenging. In this paper, we propose to train a GenQA model by transferring knowledge from a trained AS2 model, to overcome the aforementioned issue. First, we use an AS2 model to produce a ranking over answer candidates for a set of questions. Then, we use the top ranked candidate as the generation target, and the next k top ranked candidates as context for training a GenQA model. We also propose to use the AS2 model prediction scores for loss weighting and score-conditioned input/output shaping, to aid the knowledge transfer. Our evaluation on three public and one large industrial datasets demonstrates the superiority of our approach over the AS2 baseline, and GenQA trained using supervised data.", "track": "Question Answering", "label": 11}, {"loc": [6.293747901916504, 5.413885593414307], "id": 3450, "title": "Perturbation Augmentation for Fairer NLP", "authors": "Rebecca Qian, Candace Ross, Jude Fernandes, Eric Michael Smith, Douwe Kiela and Adina Williams", "abstract": "Unwanted and often harmful social biases are becoming ever more salient in NLP research, affecting both models and datasets. In this work, we ask whether training on demographically perturbed data leads to fairer language models. We collect a large dataset of human annotated text perturbations and train a neural perturbation model, which we show outperforms heuristic alternatives. We find that (i) language models (LMs) pre-trained on demographically perturbed corpora are typically more fair, and (ii) LMs finetuned on perturbed GLUE datasets exhibit less demographic bias on downstream tasks, and (iii) fairness improvements do not come at the expense of performance on downstream tasks. Lastly, we discuss outstanding questions about how best to evaluate the (un)fairness of large language models. We hope that this exploration of neural demographic perturbation will help drive more improvement towards fairer NLP.", "track": "Ethics", "label": 21}, {"loc": [6.624654293060303, 8.357181549072266], "id": 3451, "title": "Automatic Document Selection for Efficient Encoder Pretraining", "authors": "Yukun Feng, Patrick Xia, Benjamin Van Durme and Jo\u00e3o Sedoc", "abstract": "Building pretrained language models is considered expensive and data-intensive, but must we increase dataset size to achieve better performance? We propose an alternative to larger training sets by automatically identifying smaller yet domain-representative subsets. We extend Cynical Data Selection, a statistical sentence scoring method that conditions on a representative target domain corpus. As an example, we treat the OntoNotes corpus as a target domain and pretrain a RoBERTa-like encoder from a cynically selected subset of the Pile. On both perplexity and across several downstream tasks in the target domain, it consistently outperforms random selection with 20x less data, 3x fewer training iterations, and 2x less estimated cloud compute cost, validating the recipe of automatic document selection for LM pretraining.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [6.069704055786133, 12.048501014709473], "id": 3453, "title": "The Aligned Multimodal Movie Treebank: An audio, video, dependency-parse treebank", "authors": "Adam Yaari, Jan DeWitt, Henry Hu, Bennett Stankovits, Sue Felshin, Yevgeni Berzak, Helena Aparicio, Boris Katz, Ignacio Cases and Andrei Barbu", "abstract": "Treebanks have traditionally included only text and were derived from written sources such as newspapers or the web. We introduce the Aligned Multimodal Movie Treebank (AMMT), an English language treebank derived from dialog in Hollywood movies which includes transcriptions of the audio-visual streams with word-level alignment, as well as part of speech tags and dependency parses in the Universal Dependencies formalism. AMMT consists of 31,264 sentences and 218,090 words, that will amount to the 3rd largest UD English treebank and the only multimodal treebank in UD. To help with the web-based annotation effort, we also introduce the Efficient Audio Alignment Annotator (EAAA), a companion tool that enables annotators to significantly speed-up their annotation processes.", "track": "Resources and Evaluation", "label": 1}, {"loc": [10.940399169921875, 6.7785210609436035], "id": 3460, "title": "\ud83c\udf3e DEMETR: Diagnosing Evaluation Metrics for Translation", "authors": "Marzena Karpinska, Nishant Raj, Katherine Thai, Yixiao Song, Ankita Gupta and Mohit Iyyer", "abstract": "While machine translation evaluation metrics based on string overlap (e.g., BLEU) have their limitations, their computations are transparent: the BLEU score assigned to a particular candidate translation can be traced back to the presence or absence of certain words. The operations of newer learned metrics (e.g., BLEURT, COMET), which leverage pretrained language models to achieve higher correlations with human quality judgments than BLEU, are opaque in comparison. In this paper, we shed light on the behavior of these learned metrics by creating DEMETR, a diagnostic dataset with 31K English examples (translated from 10 source languages) for evaluating the sensitivity of MT evaluation metrics to 35 different linguistic perturbations spanning semantic, syntactic, and morphological error categories. All perturbations were carefully designed to form minimal pairs with the actual translation (i.e., differ in only one aspect). We find that learned metrics perform substantially better than string-based metrics on DEMETR. Additionally, learned metrics differ in their sensitivity to various phenomena (e.g., BERTScore is sensitive to untranslated words but relatively insensitive to gender manipulation, while COMET is much more sensitive to word repetition than to aspectual changes). We publicly release DEMETR to spur more informed future development of machine translation evaluation metrics", "track": "Resources and Evaluation", "label": 1}, {"loc": [1.6587108373641968, 5.436682224273682], "id": 3478, "title": "Empowering Language Models with Knowledge Graph Reasoning for Open-Domain Question Answering", "authors": "Ziniu Hu, Yichong Xu, Wenhao Yu, Shuohang Wang, Ziyi Yang, Chenguang Zhu, Kai-Wei Chang and Yizhou Sun", "abstract": "Answering open-domain questions requires world knowledge about in-context entities. As pre-trained Language Models (LMs) lack the power to store all required knowledge, external knowledge sources, such as knowledge graphs, are often used to augment LMs. \nIn this work, we propose knOwledge REasOning empowered Language Model\n(OREO-LM), which consists of a novel Knowledge Interaction Layer that can be flexibly plugged into existing Transformer-based LMs to interact with a differentiable Knowledge Graph Reasoning module collaboratively. In this way, LM guides KG to walk towards the desired answer, while the retrieved knowledge improves LM.\nBy adopting OREO-LM to RoBERTa and T5, we show significant performance gain, achieving state-of-art results in the Closed-Book setting. The performance enhancement is mainly from the KG reasoning's capacity to infer missing relational facts. In addition, OREO-LM provides reasoning paths as rationales to interpret the model's decision.", "track": "Question Answering", "label": 11}, {"loc": [6.33575439453125, 5.516512870788574], "id": 3480, "title": "Debiasing Pretrained Text Encoders by Paying Attention to Paying Attention", "authors": "Yacine Gaci, Boualem Benatallah, Fabio Casati and Khalid Benabdeslem", "abstract": "Natural Language Processing (NLP) models are found to exhibit discriminatory stereotypes across many social constructs, e.g. gender and race. In comparison to the progress made in reducing bias from static word embeddings, fairness in sentence-level text encoders received little consideration despite their wider applicability in contemporary NLP tasks. In this paper, we propose a debiasing method for pre-trained text encoders that both reduces social stereotypes, and inflicts next to no semantic damage. Unlike previous studies that directly manipulate the embeddings, we suggest to dive deeper into the operation of these encoders, and pay more attention to the way they pay attention to different social groups. We find that stereotypes are also encoded in the attention layer. Then, we work on model debiasing by redistributing the attention scores of a text encoder such that it forgets any preference to historically advantaged groups, and attends to all social classes with the same intensity. Our experiments confirm that reducing bias from attention effectively mitigates it from the model's text representations.", "track": "Ethics", "label": 21}, {"loc": [2.1277928352355957, 7.572661399841309], "id": 3486, "title": "MEE: A Novel Multilingual Event Extraction Dataset", "authors": "Amir Pouran Ben Veyseh, Javid Ebrahimi, Franck Dernoncourt and Thien Huu Nguyen", "abstract": "Event Extraction (EE) is one of the fundamental tasks in Information Extraction (IE) that aims to recognize event mentions and their arguments (i.e., participants) from text. Due to its importance, extensive methods and resources have been developed for Event Extraction. However, one limitation of current research for EE involves the under-exploration for non-English languages in which the lack of high-quality multilingual EE datasets for model training and evaluation has been the main hindrance. To address this limitation, we propose a novel Multilingual Event Extraction dataset (MEE) that provides annotation for more than 50K event mentions in 8 typologically different languages. MEE comprehensively annotates data for entity mentions, event triggers and event arguments. We conduct extensive experiments on the proposed dataset to reveal challenges and opportunities for multilingual EE. To foster future research in this direction, our dataset will be publicly available.", "track": "Ethic Concerns:Resources and Evaluation", "label": 1}, {"loc": [4.560649871826172, 4.40165901184082], "id": 3487, "title": "RobustLR: A Diagnostic Benchmark for Evaluating Logical Robustness of Deductive Reasoners", "authors": "Soumya Sanyal, Zeyi Liao and Xiang Ren", "abstract": "Transformers have been shown to be able to perform deductive reasoning on inputs containing rules and statements written in the English natural language. However, it is unclear if these models indeed follow rigorous logical reasoning to arrive at the prediction or rely on spurious correlation patterns in making decisions. A strong deductive reasoning model should consistently understand the semantics of different logical operators. To this end, we present RobustLR, a diagnostic benchmark that evaluates the robustness of language models to minimal logical edits in the inputs and different logical equivalence conditions. In our experiments with RoBERTa, T5, and GPT3 we show that the models trained on deductive reasoning datasets do not perform consistently on the RobustLR test set, thus showing that the models are not robust to our proposed logical perturbations. Further, we observe that the models find it especially hard to learn logical negation operators. Our results demonstrate the shortcomings of current language models in logical reasoning and call for the development of better inductive biases to teach the logical semantics to language models. All the datasets and code base have been made publicly available.", "track": "Resources and Evaluation", "label": 1}, {"loc": [3.773984670639038, 9.92845630645752], "id": 3488, "title": "Evaluating and Improving Factuality in Multimodal Abstractive Summarization", "authors": "David Wan and Mohit Bansal", "abstract": "Current metrics for evaluating factuality for abstractive document summarization have achieved high correlations with human judgment, but they do not account for the vision modality and thus are not adequate for vision-and-language summarization. We propose CLIPBERTSCORE, a simple weighted combination of CLIPScore and BERTScore to leverage the robustness and strong factuality detection performance between image-summary and document-summary, respectively. Next, due to the lack of meta-evaluation benchmarks to evaluate the quality of multimodal factuality metrics, we collect human judgments of factuality with respect to documents and images. We show that this simple combination of two metrics in the zero-shot setting achieves higher correlations than existing factuality metrics for document summarization, outperforms an existing multimodal summarization metric, and performs competitively with strong multimodal factuality metrics specifically fine-tuned for the task. Our thorough analysis demonstrates the robustness and high correlation of CLIPBERTSCORE and its components on four factuality metric-evaluation benchmarks. Finally, we demonstrate two practical downstream applications of our CLIPBERTSCORE metric: for selecting important images to focus on during training, and as a reward for reinforcement learning to improve factuality of multimodal summary generation w.r.t automatic and human evaluation.", "track": "Summarization", "label": 14}, {"loc": [3.7909786701202393, 9.949275016784668], "id": 3518, "title": "Referee: Reference-Free Sentence Summarization with Sharper Controllability through Symbolic Knowledge Distillation", "authors": "Melanie Sclar, Peter West, Sachin Kumar, Yulia Tsvetkov and Yejin Choi", "abstract": "We present Referee, a novel framework for sentence summarization that can be trained reference-free (i.e., requiring no gold summaries for supervision), while allowing direct control for compression ratio. Our work is the first to demonstrate that reference-free, controlled sentence summarization is feasible via the conceptual framework of Symbolic Knowledge Distillation (West et al., 2022), where latent knowledge in pre-trained language models is distilled via explicit examples sampled from the teacher models, further purified with three types of filters: length, fidelity, and Information Bottleneck. Moreover, we uniquely propose iterative distillation of knowledge, where student models from the previous iteration of distillation serve as teacher models in the next iteration. Starting off from a relatively modest set of GPT3-generated summaries, we demonstrate how iterative knowledge distillation can lead to considerably smaller, but better summarizers with sharper controllability. A useful by-product of this iterative distillation process is a high-quality dataset of sentence-summary pairs with varying degrees of compression ratios. Empirical results demonstrate that the final student models vastly outperform the much larger GPT3-Instruct model in terms of the controllability of compression ratios, without compromising the quality of resulting summarization.", "track": "Summarization", "label": 14}, {"loc": [8.260405540466309, 6.632893085479736], "id": 3519, "title": "Algorithms for Weighted Pushdown Automata", "authors": "Alexandra Butoi, Brian DuSell, Tim Vieira, Ryan Cotterell and David Chiang", "abstract": "Weighted pushdown automata (WPDAs) are at the core of many natural language processing tasks, like syntax-based statistical machine translation and transition-based dependency parsing. As most existing dynamic programming algorithms are designed for context-free grammars (CFGs), algorithms for PDAs often resort to a PDA-to-CFG conversion. In this paper, we develop novel algorithms that operate directly on WPDAs. Our algorithms are inspired by Lang's algorithm, but use a more general definition of pushdown automaton and either reduce the space requirements by a factor of |Gamma| (the size of the stack alphabet) or reduce the runtime by a factor of more than |Q| (the number of states). When run on the same class of PDAs as Lang's algorithm, our algorithm is both more space-efficient by a factor of |Gamma| and more time-efficient by a factor of |Q| x |Gamma|.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [6.22494649887085, 5.49954891204834], "id": 3529, "title": "MABEL: Attenuating Gender Bias using Textual Entailment Data", "authors": "Jacqueline He, Mengzhou Xia, Christiane Fellbaum and Danqi Chen", "abstract": "Pre-trained language models encode undesirable social biases, which are further exacerbated in downstream use. To this end, we propose MABEL (a Method for Attenuating Gender Bias using Entailment Labels), an intermediate pre-training approach for mitigating gender bias in contextualized representations. Key to our approach is the use of a contrastive learning objective on counterfactually augmented, gender-balanced entailment pairs from natural language inference (NLI) datasets. We also introduce an alignment regularizer that pulls identical entailment pairs along opposite gender directions closer. We extensively evaluate our approach on intrinsic and extrinsic metrics, and show that MABEL outperforms previous task-agnostic debiasing approaches in terms of fairness. It also preserves task performance after fine-tuning on downstream tasks. Together, these findings demonstrate the suitability of NLI data as an effective means of bias mitigation, as opposed to only using unlabeled sentences in the literature. Finally, we identify that existing approaches often use evaluation settings that are insufficient or inconsistent. We make an effort to reproduce and compare previous methods, and call for unifying the evaluation settings across gender debiasing methods for better future comparison.", "track": "Ethics", "label": 21}, {"loc": [4.536043167114258, 4.406488418579102], "id": 3539, "title": "Breakpoint Transformers for Modeling and Tracking Intermediate Beliefs", "authors": "Kyle Richardson, Ronen Tamari, Oren Sultan, Dafna Shahaf, Reut Tsarfaty and Ashish Sabharwal", "abstract": "Can we teach models designed for language understanding tasks to track and improve their beliefs through intermediate points in text? Besides making their inner workings more transparent, this would also help make models more reliable and consistent. To this end, we propose a representation learning framework called breakpoint modeling that allows for efficient and robust learning of this type. Given any text encoder and data marked with intermediate states (breakpoints) along with corresponding textual queries viewed as true/false propositions (i.e., the candidate intermediate beliefs of a model), our approach trains models in an efficient and end-to-end fashion to build intermediate representations that facilitate direct querying and training of beliefs at arbitrary points in text, alongside solving other end-tasks. We evaluate breakpoint modeling on a diverse set of NLU tasks including relation reasoning on Cluttr and narrative understanding on bAbI. Using novel proposition prediction tasks alongside these end-tasks, we show the benefit of our T5-based breakpoint transformer over strong conventional representation learning approaches in terms of processing efficiency, belief accuracy, and belief consistency, all with minimal to no degradation on the end-task. To show the feasibility of incorporating our belief tracker into more complex reasoning pipelines, we also obtain state-of-the-art performance on the three-tiered reasoning challenge for the recent TRIP benchmark (23-32% absolute improvement on Tasks 2-3).", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [6.280261039733887, 12.314812660217285], "id": 3548, "title": "Late Fusion with Triplet Margin Objective for Multimodal Ideology Prediction and Analysis", "authors": "Changyuan Qiu, Winston Wu, Xinliang Frederick Zhang and Lu Wang", "abstract": "Prior work on ideology prediction has largely focused on single modalities, i.e., text or images. In this work, we introduce the task of multimodal ideology prediction, where a model predicts binary or five-point scale ideological leanings, given a text-image pair with political content. We first collect five new large-scale datasets with English documents and images along with their ideological leanings, covering news articles from a wide range of mainstream media in US and social media posts from Reddit and Twitter. We conduct in-depth analyses on news articles and reveal differences in image content and usage across the political spectrum. Furthermore, we perform extensive experiments and ablation studies, demonstrating the effectiveness of targeted pretraining objectives on different model components. Our best-performing model, a late-fusion architecture pretrained with a triplet objective over multimodal content, outperforms the state-of-the-art text-only model by almost 4% and a strong multimodal baseline with no pretraining by over 3%.", "track": "Ethic Concerns:Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [3.1196937561035156, 4.853527545928955], "id": 3552, "title": "Leveraging QA Datasets to Improve Generative Data Augmentation", "authors": "Dheeraj Mekala, Tu Vu, Timo Schick and Jingbo Shang", "abstract": "The ability of generative language models (GLMs) to generate text has improved considerably in the last few years, enabling their use for generative data augmentation. In this work, we propose CONDA, an approach to further improve GLM's ability to generate synthetic data by reformulating data generation as context generation for a given question-answer (QA) pair and leveraging QA datasets for training context generators. Then, we cast downstream tasks into the same question answering format and adapt the fine-tuned context generators to the target task domain. Finally, we use the fine-tuned GLM to generate relevant contexts, which are in turn used as synthetic training data for their corresponding tasks. We perform extensive experiments on multiple classification datasets and demonstrate substantial improvements in performance for both few- and zero-shot settings. Our analysis reveals that QA datasets that require high-level reasoning abilities (e.g., abstractive and common-sense QA datasets) tend to give the best boost in performance in both few-shot and zero-shot settings.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [8.76574993133545, 8.01485538482666], "id": 3561, "title": "Meta-Learning Fast Weight Language Models", "authors": "Kevin Clark, Kelvin Guu, Ming-Wei Chang, Panupong Pasupat, Geoffrey Hinton and Mohammad Norouzi", "abstract": "Dynamic evaluation of language models (LMs) adapts model parameters at test time using gradient information from previous tokens and substantially improves LM performance. However, it requires over 3x more compute than standard inference. We present Fast Weight Layers (FWLs), a neural component that provides the benefits of dynamic evaluation much more efficiently by expressing gradient updates as linear attention. A key improvement over dynamic evaluation is that FWLs can also be applied at training time, so the model learns to make good use of gradient updates. FWLs can easily be added on top of existing transformer models, require relatively little extra compute or memory to run, and significantly improve language modeling perplexity.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [7.351504325866699, 7.093067169189453], "id": 3565, "title": "CTL++: Evaluating Generalization on Never-Seen Compositional Patterns of Known Functions, and Compatibility of Neural Representations", "authors": "R\u00f3bert Csord\u00e1s, Kazuki Irie and Juergen Schmidhuber", "abstract": "Well-designed diagnostic tasks have played a key role in studying the failure of neural nets (NNs) to generalize systematically. Famous examples include SCAN and Compositional Table Lookup (CTL). Here we introduce CTL++, a new diagnostic dataset based on compositions of unary symbolic functions. While the original CTL is used to test length generalization or productivity, CTL++ is designed to test systematicity of NNs, that is, their capability to generalize to unseen compositions of known functions. CTL++ splits functions into groups and tests performance on group elements composed in a way not seen during training. We show that recent CTL-solving Transformer variants fail on CTL++. The simplicity of the task design allows for fine-grained control of task difficulty, as well as many insightful analyses. For example, we measure how much overlap between groups is needed by tested NNs for learning to compose. We also visualize how learned symbol representations in outputs of functions from different groups are compatible in case of success but not in case of failure. These results provide insights into failure cases reported on more complex compositions in the natural language domain. Our code is public.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [3.8445467948913574, 9.875969886779785], "id": 3568, "title": "Learning with Rejection for Abstractive Text Summarization", "authors": "Meng Cao, Yue Dong, Jingyi He and Jackie Chi Kit Cheung", "abstract": "State-of-the-art abstractive summarization systems frequently hallucinate content that is not supported by the source document, mainly due to noise in the training dataset.\nExisting methods opt to drop the noisy samples or tokens from the training set entirely, reducing the effective training set size and creating an artificial propensity to copy words from the source. \nIn this work, we propose a training objective for abstractive summarization based on rejection learning, in which the model learns whether or not to reject potentially noisy tokens. We further propose a regularized decoding objective that penalizes non-factual candidate summaries during inference by using the rejection probability learned during training.\nWe show that our method considerably improves the factuality of generated summaries in automatic and human evaluations when compared to five baseline models, and that it does so while increasing the abstractiveness of the generated summaries.", "track": "Summarization", "label": 14}, {"loc": [7.621102333068848, 7.825701713562012], "id": 3573, "title": "Adaptive Label Smoothing with Self-Knowledge in Natural Language Generation", "authors": "Dongkyu Lee, Ka Chun Cheung and Nevin L. Zhang", "abstract": "Overconfidence has been shown to impair generalization and calibration of a neural network. Previous studies remedy this issue by adding a regularization term to a loss function, preventing a model from making a peaked distribution. Label smoothing smoothes target labels with a pre-defined prior label distribution; as a result, a model is learned to maximize the likelihood of predicting the soft label. Nonetheless, the amount of smoothing is the same in all samples and remains fixed in training. In other words, label smoothing does not reflect the change in probability distribution mapped by a model over the course of training. To address this issue, we propose a regularization scheme that brings dynamic nature into the smoothing parameter by taking model probability distribution into account, thereby varying the parameter per instance. A model in training self-regulates the extent of smoothing on the fly during forward propagation. Furthermore, inspired by recent work in bridging label smoothing and knowledge distillation, our work utilizes self-knowledge as a prior label distribution in softening target labels, and presents theoretical support for the regularization effect by knowledge distillation and the dynamic smoothing parameter. Our regularizer is validated comprehensively, and the result illustrates marked improvements in model generalization and calibration, enhancing robustness and trustworthiness of a model.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [7.767879009246826, 7.770246505737305], "id": 3581, "title": "Hard Gate Knowledge Distillation - Leverage Calibration for Robust and Reliable Language Model", "authors": "Dongkyu Lee, Zhiliang Tian, Yingxiu Zhao, Ka Chun Cheung and Nevin L. Zhang", "abstract": "In knowledge distillation, a student model is trained with supervisions from both knowledge from a teacher and observations drawn from a training data distribution. Knowledge of a teacher is considered a subject that holds inter-class relations which send a meaningful supervision to a student; hence, much effort has been put to find such knowledge to be distilled. In this paper, we explore a question that has been given little attention: \"when to distill such knowledge.\" The question is answered in our work with the concept of model calibration; we view a teacher model not only as a source of knowledge but also as a gauge to detect miscalibration of a student. This simple and yet novel view leads to a hard gate knowledge distillation scheme that switches between learning from a teacher model and training data. We verify the gating mechanism in the context of natural language generation at both the token-level and the sentence-level. Empirical comparisons with strong baselines show that hard gate knowledge distillation not only improves model generalization, but also significantly lowers model calibration error.", "track": "Natural Language Generation", "label": 6}, {"loc": [5.960874557495117, 5.220243453979492], "id": 3596, "title": "Are All Spurious Features in Natural Language Alike? An Analysis through a Causal Lens", "authors": "Nitish Joshi, Xiang Pan and He He", "abstract": "The term \u2018spurious correlations' has been used in NLP to informally denote any undesirable feature-label correlations. However, a correlation can be undesirable because (i) the feature is irrelevant to the label (e.g. punctuation in a review), or (ii) the feature's effect on the label depends on the context (e.g. negation words in a review), which is ubiquitous in language tasks. In case (i), we want the model to be invariant to the feature, which is neither necessary nor sufficient for prediction. But in case (ii), even an ideal model (e.g. humans) must rely on the feature, since it is necessary (but not sufficient) for prediction. Therefore, a more fine-grained treatment of spurious features is needed to specify the desired model behavior. We formalize this distinction using a causal model and probabilities of necessity and sufficiency, which delineates the causal relations between a feature and a label. We then show that this distinction helps explain results of existing debiasing methods on different spurious features, and demystifies surprising results such as the encoding of spurious features in model representations after debiasing.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [3.9087600708007812, 9.773994445800781], "id": 3602, "title": "Correcting Diverse Factual Errors in Abstractive Summarization via Post-Editing and Language Model Infilling", "authors": "Vidhisha Balachandran, Hannaneh Hajishirzi, William Cohen and Yulia Tsvetkov", "abstract": "Abstractive summarization models often generate inconsistent summaries containing factual errors or hallucinated content. Recent works focus on correcting factual errors in generated summaries via post-editing. Such correction models are trained using adversarial non-factual summaries constructed using heuristic rules for injecting errors. However, generating non-factual summaries using heuristics often does not generalize well to actual model errors. In this work, we propose to generate hard, representative synthetic examples of non-factual summaries through infilling language models. With this data, we train a more robust fact-correction model to post-edit the summaries to improve factual consistency. Through quantitative and qualitative experiments on two popular summarization datasets--- CNN/DM and XSum---we show that our approach vastly outperforms prior methods in correcting erroneous summaries. Our model---FactEdit---improves factuality scores by over ~11 points \non CNN/DM and over ~31 points on XSum on average across multiple summarization models, producing more factual summaries while maintaining competitive summarization quality.", "track": "Summarization", "label": 14}, {"loc": [6.900367259979248, 9.903382301330566], "id": 3621, "title": "Coordinated Topic Modeling", "authors": "Pritom Saha Akash, Jie Huang and Kevin Chang", "abstract": "We propose a new problem called coordinated topic modeling that imitates human behavior while describing a text corpus. It considers a set of well-defined topics like the axes of a semantic space with a reference representation. It then uses the axes to model a corpus for easily understandable representation. This new task helps represent a corpus more interpretably by reusing existing knowledge and benefits the corpora comparison task. We design ECTM, an embedding-based coordinated topic model that effectively uses the reference representation to capture the target corpus-specific aspects while maintaining each topic's global semantics. In ECTM, we introduce the topic- and document-level supervision with a self-training mechanism to solve the problem. Finally, extensive experiments on multiple domains show the superiority of our model over other baselines.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [1.8579953908920288, 3.9646143913269043], "id": 3627, "title": "Large Dual Encoders Are Generalizable Retrievers", "authors": "Jianmo Ni, Chen Qu, Jing Lu, Zhuyun Dai, Gustavo Hernandez Abrego, Ji Ma, Vincent Zhao, Yi Luan, Keith Hall, Ming-Wei Chang and Yinfei Yang", "abstract": "It has been shown that dual encoders trained on one domain often fail to generalize to other domains for retrieval tasks. One widespread belief is that the bottleneck layer of a dual encoder, where the final score is simply a dot-product between a query vector and a passage vector, is too limited compared to models with fine-grained interactions between the query and the passage. In this paper, we challenge this belief by scaling up the size of the dual encoder model {\\em while keeping the bottleneck layer as a single dot-product with a fixed size.} With multi-stage training, scaling up the model size brings significant improvement on a variety of retrieval tasks, especially for out-of-domain generalization. We further analyze the impact of the bottleneck layer and demonstrate diminishing improvement when scaling up the embedding size. Experimental results show that our dual encoders, \\textbf{G}eneralizable \\textbf{T}5-based dense \\textbf{R}etrievers (GTR), outperform previous sparse and dense retrievers on the BEIR dataset significantly. Most surprisingly, our ablation study finds that GTR is very data efficient, as it only needs 10\\% of MS Marco supervised data to match the out-of-domain performance of using all supervised data.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [5.51492977142334, 11.582144737243652], "id": 3631, "title": "CRIPP-VQA: Counterfactual Reasoning about Implicit Physical Properties via Video Question Answering", "authors": "Maitreya Patel, Tejas Gokhale, Chitta Baral and Yezhou Yang", "abstract": "Videos often capture objects, their visible properties, their motion, and the interactions between different objects. Objects also have physical properties such as mass, which the imaging pipeline is unable to directly capture. However, these properties can be estimated by utilizing cues from relative object motion and the dynamics introduced by collisions. In this paper, we introduce CRIPP-VQA, a new video question answering dataset for reasoning about the implicit physical properties of objects in a scene. CRIPP-VQA contains videos of objects in motion, annotated with questions that involve counterfactual reasoning about the effect of actions, questions about planning in order to reach a goal, and descriptive questions about visible properties of objects. The CRIPP-VQA test set enables evaluation under several out-of-distribution settings -- videos with objects with masses, coefficients of friction, and initial velocities that are not observed in the training distribution. Our experiments reveal a surprising and significant performance gap in terms of answering questions about implicit properties (the focus of this paper) and explicit properties of objects (the focus of prior work).", "track": "Resources and Evaluation", "label": 1}, {"loc": [0.8489496111869812, 8.135937690734863], "id": 3641, "title": "Entity-centered Cross-document Relation Extraction", "authors": "Fengqi Wang, Fei Li, Hao Fei, Jingye Li, Shengqiong Wu, Fangfang Su, Wenxuan Shi, Donghong Ji and Bo Cai", "abstract": "Relation Extraction (RE) is a fundamental task of information extraction, which has attracted a large amount of research attention. Previous studies focus on extracting the relations within a sentence or document, while currently researchers begin to explore cross-document RE. However, current cross-document RE methods directly utilize text snippets surrounding target entities in multiple given documents, which brings considerable noisy and non-relevant sentences. Moreover, they utilize all the text paths in a document bag in a coarse-grained way, without considering the connections between these text paths.In this paper, we aim to address both of these shortages and push the state-of-the-art for cross-document RE. First, we focus on input construction for our RE model and propose an entity-based document-context filter to retain useful information in the given documents by using the bridge entities in the text paths. Second, we propose a cross-document RE model based on cross-path entity relation attention, which allow the entity relations across text paths to interact with each other. We compare our cross-document RE method with the state-of-the-art methods in the dataset CodRED. Our method outperforms them by at least 10% in F1, thus demonstrating its effectiveness.", "track": "Information Extraction", "label": 5}, {"loc": [10.884037017822266, 6.772087097167969], "id": 3642, "title": "Exploring Document-Level Literary Machine Translation with Parallel Paragraphs from World Literature", "authors": "Katherine Thai, Marzena Karpinska, Kalpesh Krishna, Bill Ray, Moira Inghilleri, John Wieting and Mohit Iyyer", "abstract": "Literary translation is a culturally significant task, but it is bottlenecked by the small number of qualified literary translators relative to the many untranslated works published around the world. Machine translation (MT) holds potential to complement the work of human translators by improving both training procedures and their overall efficiency. Literary translation is less constrained than more traditional MT settings since translators must balance meaning equivalence, readability, and critical interpretability in the target language. This property, along with the complex discourse-level context present in literary texts, also makes literary MT more challenging to computationally model and evaluate. To explore this task, we collect a dataset (Par3) of non-English language novels in the public domain, each aligned at the paragraph level to both human and automatic English translations. Using Par3, we discover that expert literary translators prefer reference human translations over machine-translated paragraphs at a rate of 84\\%, while state-of-the-art automatic MT metrics do not correlate with those preferences. The experts note that MT outputs contain not only mistranslations, but also discourse-disrupting errors and stylistic inconsistencies. To address these problems, we train a post-editing model whose output is preferred over normal MT output at a rate of 69% by experts. We publicly release Par3 to spur future research into literary MT.", "track": "Resources and Evaluation", "label": 1}, {"loc": [9.314061164855957, 6.698742389678955], "id": 3654, "title": "Label-aware Multi-level Contrastive Learning for Cross-lingual Spoken Language Understanding", "authors": "Shining Liang, Linjun Shou, Jian Pei, Ming Gong, Wanli Zuo, Xianglin Zuo and Daxin Jiang", "abstract": "Despite the great success of spoken language understanding (SLU) in high-resource languages, it remains challenging in low-resource languages mainly due to the lack of labeled training data. The recent multilingual code-switching approach achieves better alignments of model representations across languages by constructing a mixed-language context in zero-shot cross-lingual SLU. However, current code-switching methods are limited to implicit alignment and disregard the inherent semantic structure in SLU, i.e., the hierarchical inclusion of utterances, slots and words. In this paper, we propose to model the utterance-slot-word structure by a multi-level contrastive learning framework at the utterance, slot and word levels to facilitate explicit alignment. Novel code-switching schemes are introduced to generate hard negative examples for our contrastive learning framework. Furthermore, we develop a label-aware joint model leveraging label semantics to enhance the implicit alignment and feed to contrastive learning. Our experimental results show that our proposed methods significantly improve the performance compared with the strong baselines on two zero-shot cross-lingual SLU benchmark datasets.", "track": "Multilinguality", "label": 13}, {"loc": [9.016339302062988, 6.935932636260986], "id": 3658, "title": "Polyglot Prompt: Multilingual Multitask Prompt Training", "authors": "Jinlan Fu, See-Kiong Ng and Pengfei Liu", "abstract": "This paper aims for a potential architectural improvement for multilingual learning and asks: Can different tasks from different languages be modeled in a monolithic framework, i.e. without any task/language-specific module? The benefit of achieving this could open new doors for future multilingual research, including allowing systems trained on low resources to be further assisted by other languages as well as other tasks. We approach this goal by developing a learning framework named Polyglot Prompting to exploit prompting methods for learning a unified semantic space for different languages and tasks with multilingual prompt engineering. We performed a comprehensive evaluation of 6 tasks, namely topic classification, sentiment classification, named entity recognition, question answering, natural language inference, and summarization, covering 24 datasets and 49 languages. The experimental results demonstrated the efficacy of multilingual multitask prompt-based learning and led to inspiring observations. We also present an interpretable multilingual evaluation methodology and show how the proposed framework, multilingual multitask prompt training, works. We release all datasets prompted in the best setting and code.", "track": "Multilinguality", "label": 13}, {"loc": [5.006961345672607, 12.456103324890137], "id": 3672, "title": "VisToT: Vision-Augmented Table-to-Text Generation", "authors": "Prajwal Gatti, Anand Mishra, Manish Gupta and Mithun Das Gupta", "abstract": "Table-to-text generation has been widely studied in the Natural Language Processing community in the recent years. We give a new perspective to this problem by incorporating signals from both tables as well as associated images to generate relevant text. While tables contain a structured list of facts, images are a rich source of unstructured visual information. For example, in the tourism domain, images can be used to infer knowledge such as the type of landmark (e.g., church), its architecture (e.g., Ancient Roman), and composition (e.g., white marble). Therefore, in this paper, we introduce the novel task of Vision-augmented Table-To-Text Generation (VisToT, defined as follows: given a table and an associated image, produce a descriptive sentence conditioned on the multimodal input. For the task, we present a novel multimodal table-to-text dataset, WikiLandmarks, covering 73,084 unique world landmarks. Further, we also present a competitive architecture, namely, VT3 that generates accurate sentences conditioned on the image and table pairs. Through extensive analyses and experiments, we show that visual cues from images are helpful in (i) inferring missing information from incomplete or sparse tables, and (ii) strengthening the importance of useful information from noisy tables for natural language generation. We make the code and data publicly available.", "track": "Natural Language Generation", "label": 6}, {"loc": [4.619162082672119, 5.92115592956543], "id": 3676, "title": "Generative Entity-to-Entity Stance Detection with Knowledge Graph Augmentation", "authors": "Xinliang Frederick Zhang, Nick Beauchamp and Lu Wang", "abstract": "Stance detection is typically framed as predicting the sentiment in a given text towards a target entity. However, this setup overlooks the importance of the source entity, i.e., who is expressing the opinion. In this paper, we emphasize the imperative need for studying interactions among entities when inferring stances. We first introduce a new task, entity-to-entity (E2E) stance detection, which primes models to identify entities in their canonical names and discern stances jointly. To support this study, we curate a new dataset with 10,641 annotations labeled at the sentence level from news articles of different ideological leanings. We present a novel generative framework to allow the generation of canonical names for entities as well as stances among them. We further enhance the model with a graph encoder to summarize entity activities and external knowledge surrounding the entities. Experiments show that our model outperforms strong comparisons by large margins. Further analyses demonstrate the usefulness of E2E stance detection for understanding media quotation and stance landscape as well as inferring entity ideology.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [5.673434734344482, 6.2658257484436035], "id": 3694, "title": "Symptom Identification for Interpretable Detection of Multiple Mental Disorders on Social Media", "authors": "Zhiling Zhang, Siyuan Chen, Mengyue Wu and Kenny Zhu", "abstract": "Mental disease detection (MDD) from social media has suffered from poor generalizability and \ninterpretability, due to lack of symptom modeling. This paper introduces PsySym, the first annotated symptom identification corpus of multiple psychiatric disorders, to facilitate further research progress. PsySym is annotated according to a knowledge graph of the 38 symptom classes related to 7 mental diseases complied from established clinical manuals and scales, and a novel annotation framework for diversity and quality. Experiments show that symptom-assisted MDD enabled by PsySym can outperform strong pure-text baselines. We also exhibit the convincing MDD explanations provided by symptom predictions with case studies, and point to their further potential applications.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [6.043289661407471, 8.350300788879395], "id": 3713, "title": "Improving Iterative Text Revision by Learning Where to Edit from Other Revision Tasks", "authors": "Zae Myung Kim, Wanyu Du, Vipul Raheja, Dhruv Kumar and Dongyeop Kang", "abstract": "Iterative text revision improves text quality by fixing grammatical errors, rephrasing for better readability or contextual appropriateness, or reorganizing sentence structures throughout a document.\nMost recent research has focused on understanding and classifying different types of edits in the iterative revision process from human-written text instead of building accurate and robust systems for iterative text revision.\nIn this work, we aim to build an end-to-end text revision system that can iteratively generate helpful edits by explicitly detecting editable spans (where-to-edit) with their corresponding edit intents and then instructing a revision model to revise the detected edit spans.\nLeveraging datasets from other related text editing NLP tasks, combined with the specification of editable spans, leads our system to more accurately model the process of iterative text refinement, as evidenced by empirical results and human evaluations.\nOur system significantly outperforms previous baselines on our text revision tasks and other standard text revision tasks, including grammatical error correction, text simplification, sentence fusion, and style transfer.\nThrough extensive qualitative and quantitative analysis, we make vital connections between edit intentions and writing quality, and better computational modeling of iterative text revisions.", "track": "Natural Language Generation", "label": 6}, {"loc": [2.438755750656128, 4.692685604095459], "id": 3716, "title": "CONQRR: Conversational Query Rewriting for Retrieval with Reinforcement Learning", "authors": "Zeqiu Wu, Yi Luan, Hannah Rashkin, David Reitter, Hannaneh Hajishirzi, Mari Ostendorf and Gaurav Singh Tomar", "abstract": "Compared to standard retrieval tasks, passage retrieval for conversational question answering (CQA) poses new challenges in understanding the current user question, as each question needs to be interpreted within the dialogue context. Moreover, it can be expensive to re-train well-established retrievers such as search engines that are originally developed for non-conversational queries. To facilitate their use, we develop a query rewriting model CONQRR that rewrites a conversational question in the context into a standalone question. It is trained with a novel reward function to directly optimize towards retrieval using reinforcement learning and can be adapted to any off-the-shelf retriever. CONQRR achieves state-of-the-art results on a recent open-domain CQA dataset containing conversations from three different sources, and is effective for two different off-the-shelf retrievers. Our extensive analysis also shows the robustness of CONQRR to out-of-domain dialogues as well as to zero query rewriting supervision.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [10.37282943725586, 7.03346061706543], "id": 3735, "title": "Specializing Multi-domain NMT via Penalizing Low Mutual Information", "authors": "Jiyoung Lee, Hantae Kim, Hyunchang Cho, Edward Choi and Cheonbok Park", "abstract": "Multi-domain Neural Machine Translation (NMT) trains a single model with multiple domains. It is appealing because of its efficacy in handling multiple domains within one model. An ideal multi-domain NMT learns distinctive domain characteristics simultaneously, however, grasping the domain peculiarity is a non-trivial task. In this paper, we investigate domain-specific information through the lens of mutual information (MI) and propose a new objective that penalizes low MI to become higher.Our method achieved the state-of-the-art performance among the current competitive multi-domain NMT models. Also, we show our objective promotes low MI to be higher resulting in domain-specialized multi-domain NMT.", "track": "Machine Translation", "label": 10}, {"loc": [2.763380527496338, 7.11945915222168], "id": 3736, "title": "A Simple Contrastive Learning Framework for Interactive Argument Pair Identification via Argument-Context Extraction", "authors": "Lida Shi, fausto giunchiglia, Rui Song, daqian Shi, Tongtong Liu, Xiaolei Diao and Hao Xu", "abstract": "Interactive argument pair identification is an emerging research task for argument mining, aiming to identify whether two arguments are interactively related. It is pointed out that the context of the argument is essential to improve identification performance. However, current context-based methods achieve limited improvements since the entire context typically contains much irrelevant information. In this paper, we propose a simple contrastive learning framework to solve this problem by extracting valuable information from the context. This framework can construct hard argument-context samples and obtain a robust and uniform representation by introducing contrastive learning. We also propose an argument-context extraction module to enhance information extraction by discarding irrelevant blocks. The experimental results show that our method achieves the state-of-the-art performance on the benchmark dataset. Further analysis demonstrates the effectiveness of our proposed modules and visually displays more compact semantic representations.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [4.9890265464782715, 5.880360126495361], "id": 3743, "title": "Sentence-level Media Bias Analysis Informed by Discourse Structures", "authors": "Yuanyuan Lei, Ruihong Huang, Lu Wang and Nick Beauchamp", "abstract": "As polarization continues to rise among both the public and the news media, increasing attention has been devoted to detecting media bias. Most recent work in the NLP community, however, identify bias at the level of individual articles. However, each article itself comprises multiple sentences, which vary in their ideological bias. In this paper, we aim to identify sentences within an article that can illuminate and explain the overall bias of the entire article. We show that understanding the discourse role of a sentence in telling a news story, as well as its relation with nearby sentences, can reveal the ideological leanings of an author even when the sentence itself appears merely neutral. In particular, we consider using a functional news discourse structure and PDTB discourse relations to inform bias sentence identification, and distill the auxiliary knowledge from the two types of discourse structure into our bias sentence identification system. Experimental results on benchmark datasets show that incorporating both the global functional discourse structure and local rhetorical discourse relations can effectively increase the recall of bias sentence identification by 8.27% - 8.62%, as well as increase the precision by 2.82% - 3.48%.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [4.046543121337891, 7.578782081604004], "id": 3755, "title": "Towards Efficient Dialogue Pre-training with Transferable and Interpretable Latent Structure", "authors": "Xueliang Zhao, Lemao Liu, Tingchen Fu, Shuming Shi, Dongyan Zhao and Rui Yan", "abstract": "With the availability of massive general-domain dialogue data, pre-trained dialogue generation appears to be super appealing to transfer knowledge from the general domain to downstream applications. In most existing work, such transferable ability is mainly obtained by fitting a large model with hundreds of millions of parameters on massive data in an exhaustive way, leading to inefficient running and poor interpretability. This paper proposes a novel dialogue generation model with a latent structure that is easily transferable from the general domain to downstream tasks in a lightweight and transparent way. Experiments on two benchmarks validate the effectiveness of the proposed model. Thanks to the transferable latent structure, our model is able to yield better dialogue responses than four strong baselines in terms of both automatic and human evaluations, and our model with about 22% parameters particularly delivers a 5x speedup in running time compared with the strongest baseline. Moreover, the proposed model is explainable by interpreting the discrete latent variables.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [7.450122356414795, 7.52590274810791], "id": 3770, "title": "An Empirical Revisiting of Linguistic Knowledge Fusion in Language Understanding Tasks", "authors": "Changlong Yu, Tianyi Xiao, Lingpeng Kong, Yangqiu Song and Wilfred Ng", "abstract": "Though linguistic knowledge emerges during large-scale language model pretraining, recent work attempt to explicitly incorporate human-defined linguistic priors into task-specific fine-tuning. Infusing language models with syntactic or semantic knowledge from parsers has shown improvements on many language understanding tasks. To further investigate the effectiveness of structural linguistic priors, we conduct empirical study of replacing parsed graphs or trees with trivial ones (rarely carrying linguistic knowledge e.g., balanced tree) for tasks in the GLUE benchmark. Encoding with trivial graphs achieves competitive or even better performance in fully-supervised and few-shot settings. It reveals that the gains might not be significantly attributed to explicit linguistic priors but rather to more feature interactions brought by fusion layers. Hence we call for attention to using trivial graphs as necessary baselines to design advanced knowledge fusion methods in the future.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [7.856321334838867, 3.4646196365356445], "id": 3772, "title": "Unsupervised Non-transferable Text Classification", "authors": "Guangtao Zeng and Wei Lu", "abstract": "Training a good deep learning model requires substantial data and computing resources, which makes the resulting neural model a valuable intellectual property. To prevent the neural network from being undesirably exploited, non-transferable learning has been proposed to reduce the model generalization ability in specific target domains. However, existing approaches require labeled data for the target domain which can be difficult to obtain. Furthermore, they do not have the mechanism to still recover the model's ability to access the target domain.\nIn this paper, we propose a novel unsupervised non-transferable learning method for the text classification task that does not require annotated target domain data. We further introduce a secret key component in our approach for recovering the access to the target domain, where we design both an explicit and an implicit method for doing so. Extensive experiments demonstrate the effectiveness of our approach.", "track": "NLP Applications", "label": 0}, {"loc": [6.381520748138428, 12.14233684539795], "id": 3783, "title": "Adaptive Contrastive Learning on Multimodal Transformer for Review Helpfulness Prediction", "authors": "Thong Nguyen, Xiaobao Wu, Anh Tuan Luu, Zhen Hai and Lidong Bing", "abstract": "Modern Review Helpfulness Prediction systems are dependent upon multiple modalities, typically texts and images. Unfortunately, those contemporary approaches pay scarce attention to polish representations of cross-modal relations and tend to suffer from inferior optimization. This might cause harm to model's predictions in numerous cases. To overcome the aforementioned issues, we propose Multi-modal Contrastive Learning for Multimodal Review Helpfulness Prediction (MRHP) problem, concentrating on mutual information between input modalities to explicitly elaborate cross-modal relations. In addition, we introduce Adaptive Weighting scheme for our contrastive learning approach in order to increase flexibility in optimization. Lastly, we propose Multimodal Interaction module to address the unalignment nature of multimodal data, thereby assisting the model in producing more reasonable multimodal representations. Experimental results show that our method outperforms prior baselines and achieves state-of-the-art results on two publicly available benchmark datasets for MRHP problem.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [10.06380844116211, 6.84252405166626], "id": 3792, "title": "Adaptive Token-level Cross-lingual Feature Mixing for Multilingual Neural Machine Translation", "authors": "Junpeng Liu, Kaiyu Huang, Jiuyi Li, Huan Liu, Jinsong Su and Degen Huang", "abstract": "Multilingual neural machine translation aims to translate multiple language pairs in a single model and has shown great success thanks to the knowledge transfer across languages with the shared parameters. Despite promising, this share-all paradigm suffers from insufficient ability to capture language-specific features. Currently, the common practice is to insert or search language-specific networks to balance the shared and specific features. However, those two types of features are not sufficient enough to model the complex commonality and divergence across languages, such as the locally shared features among similar languages, which leads to sub-optimal transfer, especially in massively multilingual translation. In this paper, we propose a novel token-level feature mixing method that enables the model to capture different features and dynamically determine the feature sharing across languages. Based on the observation that the tokens in the multilingual model are usually shared by different languages, we we insert a feature mixing layer into each Transformer sublayer and model each token representation as a mix of different features, with a proportion indicating its feature preference. In this way, we can perform fine-grained feature sharing and achieve better multilingual transfer. Experimental results on multilingual datasets show that our method outperforms various strong baselines and can be extended to zero-shot translation. Further analyses reveal that our method can capture different linguistic features and bridge the representation gap across languages.", "track": "Machine Translation", "label": 10}, {"loc": [0.6801319122314453, 7.780095100402832], "id": 3794, "title": "A Dataset for Hyper-Relational Extraction and a Cube-Filling Approach", "authors": "Yew Ken Chia, Lidong Bing, Sharifah Mahani Aljunied, Luo Si and Soujanya Poria", "abstract": "Relation extraction has the potential for large-scale knowledge graph construction, but current methods do not consider the qualifier attributes for each relation triplet, such as time, quantity or location. The qualifiers form hyper-relational facts which better capture the rich and complex knowledge graph structure. For example, the relation triplet (Leonard Parker, Educated At, Harvard University) can be factually enriched by including the qualifier (End Time, 1967). Hence, we propose the task of hyper-relational extraction to extract more specific and complete facts from text. To support the task, we construct HyperRED, a large-scale and general-purpose dataset. Existing models cannot perform hyper-relational extraction as it requires a model to consider the interaction between three entities. Hence, we propose CubeRE, a cube-filling model inspired by table-filling approaches and explicitly considers the interaction between relation triplets and qualifiers. To improve model scalability and reduce negative class imbalance, we further propose a cube-pruning method. Our experiments show that CubeRE outperforms strong baselines and reveal possible directions for future research. Our code and data are available at github.com/declare-lab/HyperRED.", "track": "Resources and Evaluation", "label": 1}, {"loc": [6.189095973968506, 12.562711715698242], "id": 3796, "title": "Low-resource Neural Machine Translation with Cross-modal Alignment", "authors": "Zhe Yang, Qingkai Fang and Yang Feng", "abstract": "How to achieve neural machine translation with limited parallel data? Existing techniques often rely on large-scale monolingual corpus, which is impractical for some low-resource languages. In this paper, we turn to connect several low-resource languages to a particular high-resource one by additional visual modality. Specifically, we propose a cross-modal contrastive learning method to learn a shared space for all languages, where both a coarse-grained sentence-level objective and a fine-grained token-level one are introduced. Experimental results and further analysis show that our method can effectively learn the cross-modal and cross-lingual alignment with a small amount of image-text pairs, and achieves significant improvements over the text-only baseline under both zero-shot and few-shot scenarios.", "track": "Machine Translation", "label": 10}, {"loc": [8.050834655761719, 8.721470832824707], "id": 3798, "title": "Prompt-based Distribution Alignment for Domain Generalization in Text Classification", "authors": "Chen Jia and Yue Zhang", "abstract": "Prompt-based learning (a.k.a. prompting) achieves high performance by bridging the gap between the objectives of language modeling and downstream tasks. Domain generalization ability can be improved by prompting since classification across different domains can be unified into the prediction of the same set of label words. The remaining challenge for domain generalization by prompting comes from discrepancies between the data distribution of different domains. To improve domain generalization with prompting, we learn distributional invariance across source domains via two alignment regularization loss functions. The first is vocabulary distribution alignment, which uses a Kullback-Leibler divergence regularization on source-domain vocabulary distributions. The second is feature distribution alignment, which uses a novel adversarial training strategy to learn domain invariant representation across source domains. Experiments on sentiment analysis and natural language inference show the effectiveness of our method and achieve state-of-the-art results on six datasets.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [3.466515302658081, 4.555367946624756], "id": 3842, "title": "Two is Better than Many? Binary Classification as an Effective Approach to Multi-Choice Question Answering", "authors": "Deepanway Ghosal, Navonil Majumder, Rada Mihalcea and Soujanya Poria", "abstract": "We propose a simple refactoring of multi-choice question answering (MCQA) tasks as a series of binary classifications. The MCQA task is generally performed by scoring each (question, answer) pair normalized over all the pairs, and then selecting the answer from the pair that yield the highest score. For n answer choices, this is equivalent to an n-class classification setup where only one class (true answer) is correct. We instead show that classifying (question, true answer) as positive instances and (question, false answer) as negative instances is significantly more effective across various models and datasets. We show the efficacy of our proposed approach in different tasks -- abductive reasoning, commonsense question answering, science question answering, and sentence completion. Our DeBERTa binary classification model reaches the top or close to the top performance on public leaderboards for these tasks. The source code of the proposed approach is available at https://github.com/declare-lab/TEAM.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [3.7342336177825928, 9.956724166870117], "id": 3845, "title": "HEGEL: Hypergraph Transformer for Long Document Summarization", "authors": "Haopeng Zhang, Xiao Liu and Jiawei Zhang", "abstract": "Extractive summarization for long documents is challenging due to the extended structured input context. The long-distance sentence dependency hinders cross-sentence relations modeling, the critical step of extractive summarization. This paper proposes HEGEL, a hypergraph neural network for long document summarization by capturing high-order cross-sentence relations. HEGEL updates and learns effective sentence representations with hypergraph transformer layers and fuses different types of sentence dependencies, including latent topics, keywords coreference, and section structure. We validate HEGEL by conducting extensive experiments on two benchmark datasets, and experimental results demonstrate the effectiveness and efficiency of HEGEL.", "track": "Summarization", "label": 14}, {"loc": [8.202272415161133, 8.762166023254395], "id": 3848, "title": "Adapting a Language Model While Preserving its General Knowledge", "authors": "Zixuan Ke, Yijia Shao, Haowei Lin, Hu Xu, Lei Shu and Bing Liu", "abstract": "Domain-adaptive pre-training (or DA-training for short), also known as post-training, aims\nto train a pre-trained general-purpose language model (LM) using an unlabeled corpus of a\nparticular domain to adapt the LM so that end-tasks in the domain can give improved performances. However, existing DA-training methods are in some sense blind as they do not explicitly identify what knowledge in the LM should be preserved and what should be changed by the domain corpus. This paper shows that the existing methods are suboptimal and proposes a novel method to perform a more informed adaptation of the knowledge in the LM by (1) soft-masking the attention heads based on their importance to best preserve the general knowledge in the LM and (2) contrasting the representations of the general and the full (both general and domain knowledge) to learn an integrated representation with both general and domain-specific knowledge. Experimental results will demonstrate the effectiveness of the proposed approach.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [3.8815696239471436, 9.929330825805664], "id": 3850, "title": "Human Guided Exploitation of Interpretable Attention Patterns in Summarization and Topic Segmentation", "authors": "Raymond Li, Wen Xiao, Linzi Xing, Lanjun Wang, Gabriel Murray and Giuseppe Carenini", "abstract": "The multi-head self-attention mechanism of the transformer model has been thoroughly investigated recently. In one vein of study, researchers are interested in understanding why and how transformers work. In another vein, researchers propose new attention augmentation methods to make transformers more accurate, efficient and interpretable. In this paper, we combine these two lines of research in a human-in-the-loop pipeline to first discover important task-specific attention patterns. Then those patterns are injected, not only to smaller models, but also to the original model. The benefits of our pipeline and discovered patterns are demonstrated in two case studies with extractive summarization and topic segmentation. After discovering interpretable patterns in BERT-based models fine-tuned for the two downstream tasks, experiments indicate that when we inject the patterns into attention heads, the models show considerable improvements in accuracy and efficiency.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [8.120949745178223, 9.335439682006836], "id": 3858, "title": "Continual Training of Language Models for Few-Shot Learning", "authors": "Zixuan Ke, Haowei Lin, Yijia Shao, Hu Xu, Lei Shu and Bing Liu", "abstract": "Recent work on applying large language models (LMs) achieves impressive performance in many NLP applications. Adapting or posttraining an LM using an unlabeled domain corpus can produce even better performance for end-tasks in the domain. This paper proposes the problem of continually extending an LM by incrementally post-train the LM with a sequence of unlabeled domain corpora to expand its knowledge without forgetting its previous skills. The goal is to improve the few-shot end-task learning in these domains. The resulting system is called CPT (Continual PostTraining), which to our knowledge, is the first continual post-training system. Experimental results verify its effectiveness.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [8.04325008392334, 9.715804100036621], "id": 3877, "title": "Dictionary-Assisted Supervised Contrastive Learning", "authors": "Patrick Y. Wu, Richard Bonneau, Joshua Tucker and Jonathan Nagler", "abstract": "Text analysis in the social sciences often involves using specialized dictionaries to reason with abstract concepts, such as perceptions about the economy or abuse on social media. These dictionaries allow researchers to impart domain knowledge and note subtle usages of words relating to a concept(s) of interest. We introduce the dictionary-assisted supervised contrastive learning (DASCL) objective, allowing researchers to leverage specialized dictionaries when fine-tuning pretrained language models. The text is first keyword simplified: a common, fixed token replaces any word in the corpus that appears in the dictionary(ies) relevant to the concept of interest. During fine-tuning, a supervised contrastive objective draws closer the embeddings of the original and keyword-simplified texts of the same class while pushing further apart the embeddings of different classes. The keyword-simplified texts of the same class are more textually similar than their original text counterparts, which additionally draws the embeddings of the same class closer together. Combining DASCL and cross-entropy improves classification performance metrics in few-shot learning settings and social science applications compared to using cross-entropy alone and alternative contrastive and data augmentation methods.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [8.573674201965332, 7.906438827514648], "id": 3888, "title": "Fine-Tuning Pre-trained Transformers into Decaying Fast Weights", "authors": "Huanru Henry Mao", "abstract": "Autoregressive Transformers are strong language models but incur O(T) complexity during per-token generation due to the self-attention mechanism. Recent work proposes kernel-based methods to approximate causal self-attention by replacing it with recurrent formulations with various update rules and feature maps to achieve O(1) time and memory complexity. We explore these approaches and find that they are unnecessarily complex, and propose a simple alternative - decaying fast weights - that runs fast on GPU, outperforms prior methods, and retains 99% of attention's performance for GPT-2. We also show competitive performance on WikiText-103 against more complex attention substitutes.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [9.666714668273926, 6.312001705169678], "id": 3891, "title": "PRO-CS : An Instance-Based Prompt Composition Technique for Code-Switched Tasks", "authors": "SRIJAN BANSAL, Suraj Tripathi, Sumit Agarwal, Teruko Mitamura and Eric Nyberg", "abstract": "Code-switched (CS) data is ubiquitous in today's globalized world, but the dearth of annotated datasets in code-switching poses a significant challenge for learning diverse tasks across different language pairs. Parameter-efficient prompt-tuning approaches conditioned on frozen language models have shown promise for transfer learning in limited-resource setups. In this paper, we propose a novel instance-based prompt composition technique, PRO-CS, for CS tasks that combine language and task knowledge. We compare our approach with prompt-tuning and fine-tuning for code-switched tasks on 10 datasets across 4 language pairs. Our model outperforms the prompt-tuning approach by significant margins across all datasets and outperforms or remains at par with fine-tuning by using just 0.18% of total parameters. We also achieve competitive results when compared with the fine-tuned model in the low-resource cross-lingual and cross-task setting, indicating the effectiveness of our approach to incorporate new code-switched tasks.", "track": "Multilinguality", "label": 13}, {"loc": [3.865253210067749, 9.926995277404785], "id": 3906, "title": "SentBS: Sentence-level Beam Search for Controllable Summarization", "authors": "Chenhui Shen, Liying Cheng, Lidong Bing, Yang You and Luo Si", "abstract": "A wide range of control perspectives have been explored in controllable text generation. Structure-controlled summarization is recently proposed as a useful and interesting research direction. However, current structure-controlling methods have limited effectiveness in enforcing the desired structure. To address this limitation, we propose a sentence-level beam search generation method (SentBS), where evaluation is conducted throughout the generation process to select suitable sentences for subsequent generations. We experiment with different combinations of decoding methods to be used as sub-components by SentBS and evaluate results on the structure-controlled dataset MReD. Experiments show that all explored combinations for SentBS can improve the agreement between the generated text and the desired structure, with the best method significantly reducing the structural discrepancies suffered by the existing model, by approximately 68%.", "track": "Summarization", "label": 14}, {"loc": [8.11849594116211, 3.0038015842437744], "id": 3925, "title": "A Fine-grained Chinese Software Privacy Policy Dataset for Sequence Labeling and Regulation Compliant Identification", "authors": "Kaifa Zhao, Le Yu, Shiyao Zhou, Jing Li, Xiapu Luo, Yat Fei Aemon Chiu and Yutong Liu", "abstract": "Privacy protection raises great attention on both legal levels and user awareness. To protect user privacy, countries enact laws and regulations requiring software privacy policies to regulate their behavior. However, privacy policies are written in professional languages with many legal terms and software jargon that prevent users from understanding and even reading them. It is necessary and urgent to use NLP techniques to analyze privacy policies. However, existing datasets ignore law requirements and are limited to English. In this paper, we construct the first Chinese privacy policy dataset, namely CA4P-483, to facilitate the sequence labeling tasks and regulation compliance identification between privacy policies and software. Our dataset includes 483 Chinese Android application privacy policies, over 11K sentences, and 52K fine-grained annotations. We evaluate families of robust and representative baseline models on our dataset. Based on baseline performance, we provide findings and potential research directions on our dataset. Finally, we investigate the potential applications of CA4P-483 combing regulation requirements and program analysis.", "track": "Resources and Evaluation", "label": 1}, {"loc": [2.3505373001098633, 4.523759841918945], "id": 3926, "title": "Saving Dense Retriever from Shortcut Dependency in Conversational Search", "authors": "Sungdong Kim and Gangwoo Kim", "abstract": "Conversational search (CS) needs a holistic understanding of conversational inputs to retrieve relevant passages. In this paper, we demonstrate the existence of a \\textit{retrieval shortcut} in CS, which causes models to retrieve passages solely relying on partial history while disregarding the latest question. With in-depth analysis, we first show that naively trained dense retrievers heavily exploit the shortcut and hence perform poorly when asked to answer history-independent questions. To build more robust models against shortcut dependency, we explore various hard negative mining strategies. Experimental results show that training with the model-based hard negatives effectively mitigates the dependency on the shortcut, significantly improving dense retrievers on recent CS benchmarks. In particular, our retriever outperforms the previous state-of-the-art model by 11.0 in Recall@10 on QReCC.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [1.672010064125061, 5.309196949005127], "id": 3929, "title": "Graph-Induced Transformers for Efficient Multi-Hop Question Answering", "authors": "Giwon Hong, Jeonghwan Kim, Junmo Kang and Sung-Hyon Myaeng", "abstract": "A graph is a suitable data structure to represent the structural information of text. Recently, multi-hop question answering (MHQA) tasks, which require inter-paragraph/sentence linkages, have come to exploit such properties of a graph. Previous approaches to MHQA relied on leveraging the graph information along with the pre-trained language model (PLM) encoders. However, this trend exhibits the following drawbacks: (i) sample inefficiency while training in a low-resource setting; (ii) lack of reusability due to changes in the model structure or input. Our work proposes the Graph-Induced Transformer (GIT) that applies graph-derived attention patterns directly into a PLM, without the need to employ external graph modules. GIT can leverage the useful inductive bias of graphs while retaining the unperturbed Transformer structure and parameters. Our experiments on HotpotQA successfully demonstrate both the sample efficient characteristic of GIT and its capacity to replace the graph modules while preserving model performance.", "track": "Question Answering", "label": 11}, {"loc": [4.8394341468811035, 3.405543565750122], "id": 3930, "title": "DiscoSense: Commonsense Reasoning with Discourse Connectives", "authors": "Prajjwal Bhargava and Vincent Ng", "abstract": "We present DiscoSense, a benchmark for commonsense reasoning via understanding a wide variety of discourse connectives. We generate compelling distractors in DiscoSense using Conditional Adversarial Filtering, an extension of Adversarial Filtering that employs conditional generation. We show that state-of-the-art pre-trained language models struggle to perform well on DiscoSense, which makes this dataset ideal for evaluating next-generation commonsense reasoning systems.", "track": "Resources and Evaluation", "label": 1}, {"loc": [0.6899417042732239, 7.923322677612305], "id": 3933, "title": "Boosting Document-Level Relation Extraction by Mining and Injecting Logical Rules", "authors": "Shengda Fan, Shasha Mo and Jianwei Niu", "abstract": "Document-level relation extraction (DocRE) aims at extracting relations of all entity pairs in a document. A key challenge to DocRE lies in the complex interdependency between the relations of entity pairs. Unlike most prior efforts focusing on implicitly powerful representations, the recently proposed LogiRE (Ru et al., 2021) explicitly captures the interdependency by learning logical rules. However, LogiRE requires extra parameterized modules to reason merely after training backbones, and this disjointed optimization of backbones and extra modules may lead to sub-optimal results. In this paper, we propose MILR, a logic enhanced framework that boosts DocRE by Mining and Injecting Logical Rules. MILR first mines logical rules from annotations based on frequencies. Then in training, consistency regularization\nis leveraged as an auxiliary loss to penalize instances that violate mined rules. Finally, MILR infers from a global perspective based on integer programming. Compared with LogiRE, MILR does not introduce extra parameters and injects logical rules during both training and inference. Extensive experiments on two benchmarks demonstrate that MILR not only improves the relation extraction performance (1.1%-3.8% F1) but also makes predictions more logically consistent (over 4.5% Logic). More importantly, MILR also consistently outperforms LogiRE on both counts. Code is available at https://github.com/XingYing-stack/MILR.", "track": "Information Extraction", "label": 5}, {"loc": [5.069385051727295, 8.885894775390625], "id": 3937, "title": "MOCHA: A Multi-Task Training Approach for Coherent Text Generation from Cognitive Perspective", "authors": "Zhe Hu, Hou Pong Chan and Lifu Huang", "abstract": "Teaching neural models to generate narrative coherent texts is a critical problem. Recent pre-trained language models have achieved promising results, but there is still a gap between human written texts and machine-generated outputs. In this work, we propose a novel multi-task training strategy for long text generation grounded on the cognitive theory of writing, which empowers the model to learn essential subskills needed for writing including planning and reviewing besides end-to-end generation. \nWe extensively evaluate our model on three open-ended generation tasks including story generation, news article writing and argument generation. Experiments show that our model achieves better results on both few-shot and fully-supervised settings than strong baselines, and human evaluations confirm that our model can generate more coherent outputs.", "track": "Natural Language Generation", "label": 6}, {"loc": [6.484869480133057, 9.078019142150879], "id": 3942, "title": "Variational Autoencoder with Disentanglement Priors for Low-Resource Task-Specific Natural Language Generation", "authors": "Zhuang Li, Lizhen Qu, Qiongkai Xu, Tongtong Wu, Tianyang Zhan and Gholamreza Haffari", "abstract": "In this paper, we propose a variational autoencoder with disentanglement priors, VAE-Dprior, for task-specific natural language generation with none or a handful of task-specific labeled examples. In order to tackle compositional generalization across tasks, our model performs disentangled representation learning by introducing a conditional prior for the latent content space and another conditional prior for the latent label space. Both types of priors satisfy a novel property called $\\epsilon$-disentangled. We show both empirically and theoretically that the novel priors can disentangle representations even without specific regularizations as in the prior work. The content prior enables directly sampling diverse content representations from the content space learned from the seen tasks, and fuse them with the representations of novel tasks for generating semantically diverse texts in the low-resource settings. Our extensive experiments demonstrate the superior performance of our model over competitive baselines in terms of i) data augmentation in continuous zero/few-shot learning, and ii) text style transfer in the few-shot setting.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [5.967228889465332, 12.417194366455078], "id": 3946, "title": "CISLR: Corpus for Indian Sign Language Recognition", "authors": "Abhinav Joshi, Ashwani Bhat, Pradeep S, Priya Gole, Shashwat Gupta, Shreyansh Agarwal and Ashutosh Modi", "abstract": "Indian Sign Language, though used by a diverse community, still lacks well-annotated resources for developing systems that would enable sign language processing. In recent years researchers have actively worked for sign languages like American Sign Languages, however, Indian Sign language is still far from data-driven tasks like machine translation. To address this gap, in this paper, we introduce a new dataset CISLR (Corpus for Indian Sign Language Recognition) for word-level recognition in Indian Sign Language using videos. The corpus has a large vocabulary of around 4700 words covering different topics and domains. Further, we propose a baseline model for word recognition from sign language videos. To handle the low resource problem in the Indian Sign Language, the proposed model consists of a prototype-based one-shot learner that leverages resource rich American Sign Language to learn generalized features for improving predictions in Indian Sign Language. Our experiments show that gesture features learned in another sign language can help perform one-shot predictions in CISLR.", "track": "Resources and Evaluation", "label": 1}, {"loc": [10.80941104888916, 9.3197660446167], "id": 3951, "title": "Mask the Correct Tokens: An Embarrassingly Simple Approach for Error Correction", "authors": "Kai Shen, Yichong Leng, Xu Tan, Siliang Tang, Yuan Zhang, Wenjie Liu and Edward Lin", "abstract": "Text error correction aims to correct the errors in text sequences such as those typed by humans or generated by speech recognition models.\nPrevious error correction methods usually take the source (incorrect) sentence as encoder input and generate the target (correct) sentence through the decoder. Since the error rate of the incorrect sentence is usually low (e.g., 10\\%), the correction model can only learn to correct on limited error tokens but trivially copy on most tokens (correct tokens), which harms the effective training of error correction. In this paper, we argue that the correct tokens should be better utilized to facilitate effective training and then propose a simple yet effective masking strategy to achieve this goal.\nSpecifically, we randomly mask out a part of the correct tokens in the source sentence and let the model learn to not only correct the original error tokens but also predict the masked tokens based on their context information. Our method enjoys several advantages: 1) it alleviates trivial copy; 2) it leverages effective training signals from correct tokens; 3) it is a plug-and-play module and can be applied to different models and tasks. Experiments on spelling error correction and speech recognition error correction on Mandarin datasets and grammar error correction on English datasets with both autoregressive and non-autoregressive generation models show that our method improves the correction\naccuracy consistently.", "track": "NLP Applications", "label": 0}, {"loc": [8.0003080368042, 9.69007682800293], "id": 3953, "title": "AMAL: Meta Knowledge-Driven Few-Shot Adapter Learning", "authors": "S. K. Hong and Tae Young Jang", "abstract": "NLP has advanced greatly together with the proliferation of Transformer-based pre-trained language models. To adapt to a downstream task, the pre-trained language models need to be fine-tuned with a sufficient supply of annotated examples. In recent years, Adapter-based fine-tuning methods have expanded the applicability of pre-trained language models by substantially lowering the required amount of annotated examples. However, existing Adapter-based methods still fail to yield meaningful results in the few-shot regime where only a few annotated examples are provided. In this study, we present a meta-learning-driven low-rank adapter pooling method, called AMAL, for leveraging pre-trained language models even with just a few data points. We evaluate our method on five text classification benchmark datasets. The results show that AMAL significantly outperforms previous few-shot learning methods and achieves a new state-of-the-art.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [6.883059978485107, 6.2786688804626465], "id": 3956, "title": "Discourse Context Predictability Effects in Hindi Word Order", "authors": "Sidharth Ranjan, Marten van Schijndel, Sumeet Agarwal and Rajakrishnan Rajkumar", "abstract": "We test the hypothesis that discourse predictability influences Hindi syntactic choice. While prior work has shown that a number of factors (e.g., information status, dependency length, and syntactic surprisal) influence Hindi word order preferences, the role of discourse predictability is underexplored in the literature. Inspired by prior work on syntactic priming, we investigate how the words and syntactic structures in a sentence influence the word order of the following sentences. Specifically, we extract sentences from the Hindi-Urdu Treebank corpus (HUTB), permute the preverbal constituents of those sentences, and build a classifier to predict which sentences actually occurred in the corpus against artificially generated distractors. The classifier uses a number of discourse-based features and cognitive features to make its predictions, including dependency length, surprisal, and information status. We find that information status and LSTM-based discourse predictability influence word order choices, especially for non-canonical object-fronted orders. We conclude by situating our results within the broader syntactic priming literature.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [5.22320032119751, 5.23854398727417], "id": 3970, "title": "\"Covid vaccine is against Covid but Oxford vaccine is made at Oxford!\" Semantic Interpretation of Proper Noun Compounds", "authors": "Keshav Kolluru, Gabriel Stanovsky and Mausam", "abstract": "Proper noun compounds, e.g., \"Covid vaccine\", convey information in a succinct manner (a \"Covid vaccine\" is a \"vaccine that immunizes against the Covid disease\"). These are commonly used in short-form domains, such as news headlines, but are largely ignored in information-seeking applications. To address this limitation, we release a new manually annotated dataset, ProNCI, consisting of 22.5K proper noun compounds along with their free-form semantic interpretations. ProNCI is 60 times larger than prior noun compound datasets and also includes non-compositional examples, which have not been previously explored. We experiment with various neural models for automatically generating the semantic interpretations from proper noun compounds, ranging from few-shot prompting to supervised learning, with varying degrees of knowledge about the constituent nouns. We find that adding targeted knowledge, particularly about the common noun, results in performance gains of upto 2.8%. Finally, we integrate our model generated interpretations with an existing Open IE system and observe an 7.5% increase in yield at a precision of 85%. The dataset and code are available at https://github.com/dair-iitd/pronci.", "track": "Information Extraction", "label": 5}, {"loc": [6.933262825012207, 6.333314895629883], "id": 3981, "title": "Context Limitations Make Neural Language Models More Human-Like", "authors": "Tatsuki Kuribayashi, Yohei Oseki, Ana Brassard and Kentaro Inui", "abstract": "Language models (LMs) have been used in cognitive modeling as well as engineering studies---they compute information-theoretic complexity metrics that simulate humans' cognitive load during reading.\nThis study highlights a limitation of modern neural LMs as the model of choice for this purpose: there is a discrepancy between their context access capacities and that of humans.\nOur results showed that constraining the LMs' context access improved their simulation of human reading behavior.\nWe also showed that LM-human gaps in context access were associated with specific syntactic constructions; incorporating syntactic biases into LMs' context access might enhance their cognitive plausibility.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [2.8015122413635254, 7.134524822235107], "id": 3983, "title": "A Generative Model for End-to-End Argument Mining with Reconstructed Positional Encoding and Constrained Pointer Mechanism", "authors": "Jianzhu Bao, Yuhang He, Yang Sun, Bin Liang, Jiachen Du, Bing Qin, Min Yang and Ruifeng Xu", "abstract": "Argument mining (AM) is a challenging task as it requires recognizing the complex argumentation structures involving multiple subtasks.\nTo handle all subtasks of AM in an end-to-end fashion, previous works generally transform AM into a dependency parsing task.\nHowever, such methods largely require complex pre- and post-processing to realize the task transformation.\nIn this paper, we investigate the end-to-end AM task from a novel perspective by proposing a generative framework, in which the expected outputs of AM are framed as a simple target sequence. \nThen, we employ a pre-trained sequence-to-sequence language model with a constrained pointer mechanism (CPM) to model the clues for all the subtasks of AM in the light of the target sequence. Furthermore, we devise a reconstructed positional encoding (RPE) to alleviate the order biases induced by the autoregressive generation paradigm.\nExperimental results show that our proposed framework achieves new state-of-the-art performance on two AM benchmarks.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [4.174262046813965, 7.014216899871826], "id": 3997, "title": "Reflect, Not Reflex: Inference-Based Common Ground Improves Dialogue Response Quality", "authors": "Pei Zhou, Hyundong J. Cho, Pegah Jandaghi, Dong-Ho Lee, Bill Yuchen Lin, Jay Pujara and Xiang Ren", "abstract": "Human communication relies on common ground (CG), the mutual knowledge and beliefs shared by participants, to produce coherent and interesting conversations. In this paper, we demonstrate that current response generation (RG) models produce generic and dull responses in dialogues because they act reflexively, failing to explicitly model CG, both due to the lack of CG in training data and the standard RG training procedure. We introduce Reflect, a dataset that annotates dialogues with explicit CG (materialized as inferences approximating shared knowledge and beliefs) and solicits 9k diverse human-generated responses each following one common ground. Using Reflect, we showcase the limitations of current dialogue data and RG models: less than half of the responses in current data is rated as high quality (sensible, specific, and interesting) and models trained using this data have even lower quality, while most Reflect responses are judged high quality. Next, we analyze whether CG can help models produce better quality responses by using Reflect CG to guide RG models. Surprisingly, we find that simply prompting GPT3 to \"think\" about CG generates 30% more quality responses, showing promising benefits to integrating CG into the RG process.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [4.272122383117676, 7.322163105010986], "id": 4002, "title": "FlowEval: A Consensus-Based Dialogue Evaluation Framework Using Segment Act Flows", "authors": "Jianqiao Zhao, Yanyang Li, Wanyu Du, Yangfeng Ji, Dong Yu, Michael Lyu and Liwei Wang", "abstract": "Despite recent progress in open-domain dialogue evaluation, how to develop automatic metrics remains an open problem. We explore the potential of dialogue evaluation featuring dialog act information, which was hardly explicitly modeled in previous methods. However, defined at the utterance level in general, dialog act is of coarse granularity, as an utterance can contain multiple segments possessing different functions. Hence, we propose segment act, an extension of dialog act from utterance level to segment level, and crowdsource a large-scale dataset for it. To utilize segment act flows, sequences of segment acts, for evaluation, we develop the first consensus-based dialogue evaluation framework, FlowEval. This framework provides a reference-free approach for dialog evaluation by finding pseudo-references. Extensive experiments against strong baselines on three benchmark datasets demonstrate the effectiveness and other desirable characteristics of our FlowEval, pointing out a potential path for better dialogue evaluation.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [5.92603874206543, 12.239973068237305], "id": 4022, "title": "FaD-VLP: Fashion Vision-and-Language Pre-training towards Unified Retrieval and Captioning", "authors": "Suvir Mirchandani, Licheng Yu, Mengjiao Wang, Animesh Sinha, Wenwen Jiang, Tao Xiang and Ning Zhang", "abstract": "Multimodal tasks in the fashion domain have significant potential for e-commerce, but involve challenging vision-and-language learning problems\u2014e.g., retrieving a fashion item given a reference image plus text feedback from a user. Prior works on multimodal fashion tasks have either been limited by the data in individual benchmarks, or have leveraged generic vision-and-language pre-training but have not taken advantage of the characteristics of fashion data. Additionally, these works have mainly been restricted to multimodal understanding tasks. To address these gaps, we make two key contributions. First, we propose a novel fashion-specific pre-training framework based on weakly-supervised triplets constructed from fashion image-text pairs. We show the triplet-based tasks are an effective addition to standard multimodal pre-training tasks. Second, we propose a flexible decoder-based model architecture capable of both fashion retrieval and captioning tasks. Together, our model design and pre-training approach are competitive on a diverse set of fashion tasks, including cross-modal retrieval, image retrieval with text feedback, image captioning, relative image captioning, and multimodal categorization.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [6.141786575317383, 12.412346839904785], "id": 4024, "title": "MM-Align: Learning Optimal Transport-based Alignment Dynamics for Fast and Accurate Inference on Missing Modality Sequences", "authors": "Wei Han, Hui Chen, Min-Yen Kan and Soujanya Poria", "abstract": "Existing multimodal tasks mostly target at the complete input modality setting, i.e., each modality is either complete or completely missing in both training and test sets. However, the randomly missing situations have still been underexplored. In this paper, we present a novel approach named MM-Align to address the missing-modality inference problem. Concretely, we propose 1) an alignment dynamics learning module based on the theory of optimal transport (OT) for missing data imputation; 2) a denoising training algorithm to enhance the quality of imputation as well as the accuracy of model predictions. Compared with previous generative methods which devote to restoring the missing inputs, MM-Align learns to capture and imitate the alignment dynamics between modality sequences. Results of comprehensive experiments on two multimodal tasks empirically demonstrate that our method can perform more accurate and faster inference and alleviate the overfitting issue under different missing conditions.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [3.489875555038452, 4.533748626708984], "id": 4038, "title": "Evaluating the Knowledge Dependency of Questions", "authors": "Hyeongdon Moon, Yoonseok Yang, Hangyeol Yu, Seunghyun Lee, Myeongho Jeong, juneyoung park, Jamin Shin, Minsam Kim and Seungtaek Choi", "abstract": "The automatic generation of Multiple Choice Questions (MCQ) has the potential to reduce the time educators spend on student assessment significantly. However, existing evaluation metrics for MCQ generation, such as BLEU, ROUGE, and METEOR, focus on the n-gram based similarity of the generated MCQ to the gold sample in the dataset and disregard their educational value.\nThey fail to evaluate the MCQ's ability to assess the student's knowledge of the corresponding target fact. To tackle this issue, we propose a novel automatic evaluation metric, coined Knowledge Dependent Answerability (KDA), which measures the MCQ's answerability given knowledge of the target fact. Specifically, we first show how to measure KDA based on student responses from a human survey.\nThen, we propose two automatic evaluation metrics, KDA_disc and KDA_cont, that approximate KDA by leveraging pre-trained language models to imitate students' problem-solving behavior.\nThrough our human studies, we show that KDA_disc and KDA_soft have strong correlations with both (1) KDA and (2) usability in an actual classroom setting, labeled by experts. Furthermore, when combined with n-gram based similarity metrics, KDA_disc and KDA_cont are shown to have a strong predictive power for various expert-labeled MCQ quality measures.", "track": "Resources and Evaluation", "label": 1}, {"loc": [0.4601297080516815, 7.090707778930664], "id": 4044, "title": "MoSE: Modality Split and Ensemble for Multimodal Knowledge Graph Completion", "authors": "Yu Zhao, Xiangrui Cai, Yike Wu, Haiwei Zhang, Ying Zhang, Guoqing Zhao and Ning Jiang", "abstract": "Multimodal knowledge graph completion (MKGC) aims to predict missing entities in MKGs. Previous works usually share relation representation across modalities. This results in mutual interference between modalities during training, since for a pair of entities, the relation from one modality probably contradicts that from another modality. Furthermore, making a unified prediction based on the shared relation representation treats the input in different modalities equally, while their importance to the MKGC task should be different. In this paper, we propose MoSE, a Modality Split representation learning and Ensemble inference framework for MKGC. Specifically, in the training phase, we learn modality-split relation embeddings for each modality instead of a single modality-shared one, which alleviates the modality interference. Based on these embeddings, in the inference phase, we first make modality-split predictions and then exploit various ensemble methods to combine the predictions with different weights, which models the modality importance dynamically. Experimental results on three KG datasets show that MoSE outperforms state-of-the-art MKGC methods. Codes are available at https://github.com/OreOZhao/MoSE4MKGC.", "track": "NLP Applications", "label": 0}, {"loc": [10.188883781433105, 7.005106449127197], "id": 4059, "title": "Entropy-Based Vocabulary Substitution for Incremental Learning in Multilingual Neural Machine Translation", "authors": "Kaiyu Huang, Peng Li, Jin Ma and Yang Liu", "abstract": "In a practical real-world scenario, the longstanding goal is that a universal multilingual translation model can be incrementally updated when new language pairs arrive. Specifically, the initial vocabulary only covers some of the words in new languages, which hurts the translation quality for incremental learning. Although existing approaches attempt to address this issue by replacing the original vocabulary with a rebuilt vocabulary or constructing independent language-specific vocabularies, these methods can not meet the following three demands simultaneously: (1) High translation quality for original and incremental languages, (2) low cost for model training, (3) low time overhead for preprocessing. In this work, we propose an entropy-based vocabulary substitution (EVS) method that just needs to walk through new language pairs for incremental learning in a large-scale multilingual data updating while remaining the size of the vocabulary. Our method has access to learn new knowledge from updated training samples incrementally while keeping high translation quality for original language pairs, alleviating the issue of catastrophic forgetting. Results of experiments show that EVS can achieve better performance and save excess overhead for incremental learning in the multilingual machine translation task.", "track": "Machine Translation", "label": 10}, {"loc": [4.077825546264648, 6.998421669006348], "id": 4070, "title": "Eliciting Knowledge from Large Pre-Trained Models for Unsupervised Knowledge-Grounded Conversation", "authors": "Yanyang Li, Jianqiao Zhao, Michael Lyu and Liwei Wang", "abstract": "Recent advances in large-scale pre-training provide large models with the potential to learn knowledge from the raw text. It is thus natural to ask whether it is possible to leverage these large models as knowledge bases for downstream tasks. In this work, we answer the aforementioned question in unsupervised knowledge-grounded conversation. We explore various methods that best elicit knowledge from large models. Our human study indicates that, though hallucinations exist, large models post the unique advantage of being able to output common sense and summarize facts that cannot be directly retrieved from the search engine. To better exploit such generated knowledge in dialogue generation, we treat the generated knowledge as a noisy knowledge source and propose the posterior-based reweighing as well as the noisy training strategy. Empirical results on two benchmarks show advantages over the state-of-the-art methods.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [7.050638675689697, 5.9171223640441895], "id": 4085, "title": "An Unsupervised, Geometric and Syntax-aware Quantification of Polysemy", "authors": "Anmol Goel, Charu Sharma and Ponnurangam Kumaraguru", "abstract": "Polysemy is the phenomenon where a single word form possesses two or more related senses. It is an extremely ubiquitous part of natural language and analyzing it has sparked rich discussions in the linguistics, psychology and philosophy communities alike. With scarce attention paid to polysemy in computational linguistics, and even scarcer attention toward quantifying polysemy, in this paper, we propose a novel, unsupervised framework to compute and estimate polysemy scores for words in multiple languages. We infuse our proposed quantification with syntactic knowledge in the form of dependency structures. This informs the final polysemy scores of the lexicon motivated by recent linguistic findings that suggest there is an implicit relation between syntax and ambiguity/polysemy. We adopt a graph based approach by computing the discrete Ollivier Ricci curvature on a graph of the contextual nearest neighbors. We test our framework on curated datasets controlling for different sense distributions of words in 3 typologically diverse languages - English, French and Spanish. The effectiveness of our framework is demonstrated by significant correlations of our quantification with expert human annotated language resources like WordNet. We observe a 0.3 point increase in the correlation coefficient as compared to previous quantification studies in English. Our research leverages contextual language models and syntactic structures to empirically support the widely held theoretical linguistic notion that syntax is intricately linked to ambiguity/polysemy.", "track": "Discourse and Pragmatics", "label": 24}, {"loc": [8.320326805114746, 6.569927215576172], "id": 4088, "title": "Reorder and then Parse, Fast and Accurate Discontinuous Constituency Parsing", "authors": "Kailai Sun, Zuchao Li and Hai Zhao", "abstract": "Discontinuous constituency parsing is still kept developing for its efficiency and accuracy are far behind its continuous counterparts. Motivated by the observation that a discontinuous constituent tree can be simply transformed into a pseudo-continuous one by artificially reordering words in the sentence, we propose a novel reordering method, thereby construct fast and accurate discontinuous constituency parsing systems working in continuous way. Specifically, we model the relative position changes of words as a list of actions. By parsing and performing this actions, the corresponding pseudo-continuous sequence is derived. Discontinuous parse tree can be further inferred via integrating a high-performance pseudo-continuous constituency parser. Our systems are evaluated on three classical discontinuous constituency treebanks, achieving new state-of-the-art on two treebanks and showing a distinct advantage in speed.", "track": "Syntax, Parsing and their Applications", "label": 23}, {"loc": [3.4574875831604004, 9.518741607666016], "id": 4102, "title": "Making Science Simple: Corpora for the Lay Summarisation of Scientific Literature", "authors": "Tomas Goldsack, Zhihao Zhang, Chenghua Lin and Carolina Scarton", "abstract": "Lay summarisation aims to jointly summarise and simplify a given text, thus making its content more comprehensible to non-experts.\nAutomatic approaches for lay summarisation can provide significant value in broadening access to scientific literature, enabling a greater degree of both interdisciplinary knowledge sharing and public understanding when it comes to research findings. However, current corpora for this task are limited in their size and scope, hindering the development of broadly applicable data-driven approaches. \nAiming to rectify these issues, we present two novel lay summarisation datasets, PLOS (large-scale) and eLife (medium-scale), each of which contains biomedical journal articles alongside expert-written lay summaries.\nWe provide a thorough characterisation of our lay summaries, \nhighlighting differing levels of readability and abstractiveness\nbetween datasets that can be leveraged to support the needs of different applications.\nFinally, we benchmark our datasets using mainstream summarisation approaches and perform a manual evaluation with domain experts, demonstrating their utility and casting light on the key challenges of this task.", "track": "Resources and Evaluation", "label": 1}, {"loc": [5.826259136199951, 5.1483612060546875], "id": 4106, "title": "Looking at the Overlooked: An Analysis on the Word-Overlap Bias in Natural Language Inference", "authors": "Sara Rajaee, Yadollah Yaghoobzadeh and Mohammad Taher Pilehvar", "abstract": "It has been shown that NLI models are usually biased with respect to the word-overlap between the premise and the hypothesis, as they take this feature as a primary cue for predicting the entailment label. \nIn this paper, we focus on an overlooked aspect of the overlap bias in the NLI models: the reverse word-overlap bias. \nOur experimental results demonstrate that current NLI systems are also highly biased towards the non-entailment label on instances with low overlap and that existing debiasing methods, which are reportedly successful on challenge datasets, are generally ineffective in addressing this category of bias.\nThrough a set of analyses, we investigate the reasons for the emergence of the overlap bias and the role of minority examples in mitigating this bias.\nFor the former, we find that the word overlap bias does not stem from pre-training, and in the latter, we observe that in contrast to the accepted assumption, eliminating minority examples does not affect the generalizability of debiasing methods with respect to the overlap bias.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [8.720785140991211, 8.399993896484375], "id": 4109, "title": "An Empirical Study on the Transferability of Transformer Modules in Parameter-efficient Fine-tuning", "authors": "Mohammad A. Tajari, Sara Rajaee and Mohammad Taher Pilehvar", "abstract": "Parameter-efficient fine-tuning has garnered lots of attention in recent studies.\nOn this subject, we investigate the capability of different transformer modules in transferring knowledge from a pre-trained model to a downstream task. \nOur empirical results suggest that every transformer module is a winning ticket such that fine-tuning the specific module while the rest of the network is frozen achieves a comparable performance to the full fine-tuning case. \nAmong different modules in LMs, LayerNorms exhibit a significant capacity for transfer learning to the extent that with only 0.003% updateable parameters in the layer-wise analysis, they can show acceptable performance on various target tasks.\nWe argue that the performance of LayerNorms could be attributed to their high-magnitude weights compared to other components in a pre-trained model.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [1.7835698127746582, 3.8761324882507324], "id": 4135, "title": "CODER: An efficient framework for improving retrieval through COntextual Document Embedding Reranking", "authors": "George Zerveas, Navid Rekabsaz, Daniel Cohen and Carsten Eickhoff", "abstract": "Contrastive learning has been the dominant approach to training dense retrieval models. In this work, we investigate the impact of ranking context - an often overlooked aspect of learning dense retrieval models. In particular, we examine the effect of its constituent parts: jointly scoring a large number of negatives per query, using retrieved (query-specific) instead of random negatives, and a fully list-wise loss.\n\nTo incorporate these factors into training, we introduce Contextual Document Embedding Reranking (CODER), a highly efficient retrieval framework. When reranking, it incurs only a negligible computational overhead on top of a first-stage method at run time (approx. 5 ms delay per query), allowing it to be easily combined with any state-of-the-art dual encoder method. Models trained through CODER can also be used as stand-alone retrievers.\n\nEvaluating CODER in a large set of experiments on the MS MARCO and TripClick collections, we show that the contextual reranking of precomputed document embeddings leads to a significant improvement in retrieval performance. This improvement becomes even more pronounced when more relevance information per query is available, shown in the TripClick collection, where we establish new state-of-the-art results by a large margin.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [7.674646377563477, 8.559362411499023], "id": 4141, "title": "AdapterShare: Task Correlation Modeling with Adapter Differentiation", "authors": "Zhi Chen, Bei Chen, Lu Chen, Kai Yu and Jian-Guang LOU", "abstract": "Thanks to the development of pre-trained language models, multitask learning (MTL) methods achieve a great success in natural language understanding area.\nHowever, current MTL methods pay more attention to task selection or model design to fuse as much knowledge as possible, while intrinsic task correlation is often neglected. It is important to learn sharing strategy among multiple tasks rather than sharing everything.\n%The MTL model is directly shared among all the tasks. \n%For example, in traditional MTL methods, the last classification layers or the decoder layers are manually separated. More deeply, \nIn this paper, we propose AdapterShare, an adapter differentiation method to explicitly model the task correlation among multiple tasks. AdapterShare is automatically learned based on the gradients on tiny held-out validation data. Compared to single-task learning and fully shared MTL methods, our proposed method obtains obvious performance improvement. Compared to the existing MTL method AdapterFusion, AdapterShare achieves absolute 1.90 average points improvement on five dialogue understanding tasks and 2.33 points gain on NLU tasks.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [7.705076694488525, 7.960129737854004], "id": 4146, "title": "Rethinking Task-Specific Knowledge Distillation: Contextualized Corpus as Better Textbook", "authors": "Chang Liu, Chongyang Tao, Jianxin Liang, Tao Shen, Jiazhan Feng, Quzhe Huang and Dongyan Zhao", "abstract": "Knowledge distillation has been proven effective when customizing small language models for specific tasks. Here, a corpus as `textbook' plays an indispensable role, only through which the teacher can teach the student. Prevailing methods adopt a two-stage distillation paradigm: general distillation first with task-agnostic general corpus and task-specific distillation next with augmented task-specific corpus. We argue that such a paradigm may not be optimal. In general distillation, it's extravagant to let the diverse but desultory general knowledge overwhelms the limited model capacity of the student. While in task-specific distillation, the task corpus is usually limited and narrow, preventing the student from learning enough knowledge. To mitigate the issues in the two gapped corpora, we present a better textbook for the student to learn: contextualized corpus that contextualizes task corpus with large-scale general corpus through relevance-based text retrieval. Experimental results on GLUE benchmark demonstrate that contextualized corpus is the better textbook compared with jointly using general corpus and augmented task-specific corpus. Surprisingly, it enables task-specific distillation from scratch without general distillation while maintaining comparable performance, making it more flexible to customize the student model with desired model size under various computation constraints.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [2.0560696125030518, 4.054901599884033], "id": 4154, "title": "Recovering Gold from Black Sand: Multilingual Dense Passage Retrieval with Hard and False Negative Samples", "authors": "Tianhao Shen, Mingtong Liu, Ming Zhou and Deyi Xiong", "abstract": "Negative samples have not been efficiently explored in multilingual dense passage retrieval. In this paper, we propose a novel multilingual dense passage retrieval framework, mHFN, to recover and utilize hard and false negative samples. mHFN consists of three key components: 1) a multilingual hard negative sample augmentation module that allows knowledge of indistinguishable passages to be shared across multiple languages and synthesizes new hard negative samples by interpolating representations of queries and existing hard negative samples, 2) a multilingual negative sample cache queue that stores negative samples from previous batches in each language to increase the number of multilingual negative samples used in training beyond the batch size limit, and 3) a lightweight adaptive false negative sample filter that uses generated pseudo labels to separate unlabeled false negative samples and converts them into positive passages in training. We evaluate mHFN on Mr. TyDi, a high-quality multilingual dense passage retrieval dataset covering eleven typologically diverse languages, and experimental results show that mHFN outperforms strong sparse, dense and hybrid baselines and achieves new state-of-the-art performance on all languages. Our source code is available at https://github.com/Magnetic2014/mHFN.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [5.873255729675293, 5.7732672691345215], "id": 4155, "title": "The \"Problem\u201d of Human Label Variation: On Ground Truth in Data, Modeling and Evaluation", "authors": "Barbara Plank", "abstract": "Human variation in labeling is often considered noise. Annotation projects for machine learning (ML) aim at minimizing human label variation, with the assumption to maximize data quality and in turn optimize and maximize machine learning metrics. However, this\nconventional practice assumes that there exists a *ground truth*, and neglects that there exists genuine human variation in labeling due to disagreement, subjectivity in annotation or multiple plausible answers.\nIn this position paper, we argue that this big open problem of \\textit{human label variation} persists and critically needs more attention to move our field forward. This is because human label variation impacts all stages of the ML pipeline: *data, modeling and evaluation*. However, few works consider all of these dimensions jointly; and existing research is fragmented. We reconcile different previously proposed notions of human label variation, provide a repository of publicly-available datasets with un-aggregated labels, depict approaches proposed so far, identify gaps and suggest ways forward. As datasets are becoming increasingly available, we hope that this synthesized view on the ``problem'' will lead to an open discussion on possible strategies to devise fundamentally new directions.", "track": "Theme Track", "label": 18}, {"loc": [10.87709903717041, 6.870487213134766], "id": 4176, "title": "Quality Scoring of Source Words in Neural Translation Models", "authors": "Priyesh Jain, Sunita Sarawagi and Tushar Tomar", "abstract": "Word-level quality scores on input source sentences can provide useful feedback to an end-user when translating into an unfamiliar target language. Recent approaches either require training special word-scoring models based on synthetic data or require repeated invocation of the translation model. We propose a simple approach based on comparing the difference of probabilities from two language models. The basic premise of our method is to reason how well each source word is explained by the target sentence as against the source language model. Our approach provides up to five points higher F1 scores and is significantly faster than the state of the art methods on three language pairs. Also, our method does not require training any new model. We release a public dataset on word omissions and mistranslations on a new language pair.", "track": "Machine Translation", "label": 10}, {"loc": [4.682380199432373, 7.255197048187256], "id": 4194, "title": "Pneg: Prompt-based Negative Response Generation for Dialogue Response Selection Task", "authors": "Nyoungwoo Lee, ChaeHun Park, Ho-Jin Choi and Jaegul Choo", "abstract": "In retrieval-based dialogue systems, a response selection model acts as a ranker to select the most appropriate response among several candidates. However, such selection models tend to rely on context-response content similarity, which makes models vulnerable to adversarial responses that are semantically similar but not relevant to the dialogue context. Recent studies have shown that leveraging these adversarial responses as negative training samples is useful for improving the discriminating power of the selection model. Nevertheless, collecting human-written adversarial responses is expensive, and existing synthesizing methods often have limited scalability. To overcome these limitations, this paper proposes a simple but efficient method for generating adversarial negative responses leveraging a large-scale language model. Experimental results on dialogue selection tasks show that our method outperforms other methods of synthesizing adversarial negative responses. These results suggest that our method can be an effective alternative to human annotators in generating adversarial responses. Our code and dataset will be released if the paper is accepted.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [3.816869020462036, 5.820551872253418], "id": 4197, "title": "Facilitating Contrastive Learning of Discourse Relational Senses by Exploiting the Hierarchy of Sense Relations", "authors": "Wanqiu Long and Bonnie Webber", "abstract": "Implicit discourse relation recognition is a challenging task that involves identifying the sense or senses that hold between two adjacent spans of text, in the absense of an explicit connective between them. In both PDTB-2 (prasad et al., 2008) and PDTB-3 (Webber et al., 2019), discourse relational senses are organized into a three-level hierarchy ranging from four broad top-level senses, to more specific senses below them. Most previous work on implicitf discourse relation recognition have used the sense hierarchy simply to indicate what sense labels were available. Here we do more --- incorporating the sense hierarchy into the recognition process itself and using it to select the negative examples used in contrastive learning. With no additional effort, the approach achieves state-of-the-art performance on the task. Our code is released in\nhttps://github.com/wanqiulong 0923/Contrastive\\_IDRR.", "track": "Discourse and Pragmatics", "label": 24}, {"loc": [7.287219047546387, 7.856777191162109], "id": 4199, "title": "Simplified Graph Learning for Inductive Short Text Classification", "authors": "Kaixin Zheng, Yaqing Wang, Quanming Yao and Dejing Dou", "abstract": "Short text classification (STC) is hard as short texts lack context information and labeled data is not enough. Graph neural networks obtain the state-of-the-art on STC since they can merge various auxiliary information via the message passing framework. However, existing works conduct transductive learning, which requires retraining to accommodate new samples and takes large memory. In this paper, we present SimpleSTC which handles inductive STC problem but only leverages words. We construct word graph from an external large corpus to compensate for the lack of semantic information, and learn text graph to handle the lack of labeled data. Results show that SimpleSTC obtains state-of-the-art performance with lower memory consumption and faster inference speed.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [9.276308059692383, 7.001067161560059], "id": 4202, "title": "Don't Stop Fine-Tuning: On Training Regimes for Few-Shot Cross-Lingual Transfer with Multilingual Language Models", "authors": "Fabian David Schmidt, Ivan Vuli\u0107 and Goran Glava\u0161", "abstract": "A large body of recent work highlights the fallacies of zero-shot cross-lingual transfer (ZS-XLT) with large multilingual language models. Namely, their performance varies substantially for different target languages and is the weakest where needed the most: for low-resource languages distant to the source language. One remedy is few-shot transfer (FS-XLT), where leveraging only a few task-annotated instances in the target language(s) may yield sizable performance gains. However, FS-XLT also succumbs to large variation, as models easily overfit to the small datasets. In this work, we present a systematic study focused on a spectrum of FS-XLT fine-tuning regimes, analyzing key properties such as effectiveness, (in)stability, and modularity. We conduct extensive experiments on both higher-level (NLI, paraphrasing) and lower-level tasks (NER, POS), presenting new FS-XLT strategies that yield both improved and more stable FS-XLT across the board. \nOur findings challenge established FS-XLT methods: e.g., we propose to replace sequential fine-tuning with joint fine-tuning on source and target language instances, offering consistent gains with different number of shots (including resource-rich scenarios). We also show that further gains can be achieved with multi-stage FS-XLT training in which joint multilingual fine-tuning precedes the bilingual source-target specialization.", "track": "Multilinguality", "label": 13}, {"loc": [6.511099338531494, 1.849408745765686], "id": 4226, "title": "Towards Compositional Generalization in Code Search", "authors": "Hojae Han, Seung-won Hwang, Shuai Lu, Nan Duan and Seungtaek Choi", "abstract": "We study compositional generalization, which aims to generalize on unseen combinations of seen structural elements, for code search. Unlike existing approaches of partially pursuing this goal, we study how to extract structural elements, which we name a template that directly targets compositional generalization. Thus we propose CTBERT, or Code Template BERT, representing codes using automatically extracted templates as building blocks. We empirically validate CTBERT on two public code search benchmarks, AdvTest and CSN. Further, we show that templates are complementary to data flow graphs in GraphCodeBERT, by enhancing structural context around variables.", "track": "NLP Applications", "label": 0}, {"loc": [0.848892867565155, 8.112730026245117], "id": 4227, "title": "Towards relation extraction from speech", "authors": "Tongtong Wu, Guitao Wang, Jinming Zhao, Zhaoran Liu, Guilin Qi, Yuan-Fang Li and Gholamreza Haffari", "abstract": "Relation extraction typically aims to extract semantic relationships between entities from the unstructured text.\nOne of the most essential data sources for relation extraction is the spoken language, such as interviews and dialogues.\nHowever, the error propagation introduced in automatic speech recognition (ASR) has been ignored in relation extraction, and the end-to-end speech-based relation extraction method has been rarely explored.\nIn this paper, we propose a new listening information extraction task, i.e., speech relation extraction.\nWe construct the training dataset for speech relation extraction via text-to-speech systems, and we construct the testing dataset via crowd-sourcing with native English speakers.\nWe explore speech relation extraction via two approaches: the pipeline approach conducting text-based extraction with a pretrained ASR module, and the end2end approach via a new proposed encoder-decoder model, or what we called SpeechRE.\nWe conduct comprehensive experiments to distinguish the challenges in speech relation extraction, which may shed light on future explorations. We share the code and data on https://github.com/wutong8023/SpeechRE.", "track": "Information Extraction", "label": 5}, {"loc": [4.064045429229736, 7.129462718963623], "id": 4230, "title": "Structural Constraints and Natural Language Inference for End-to-End Flowchart Grounded Dialog Response Generation", "authors": "Dinesh Raghu, Suraj Joshi, Sachindra Joshi and Mausam", "abstract": "Flowchart grounded dialog systems converse with users by following a given flowchart and a corpus of FAQs. The existing state-of-the-art approach (Raghu et al, 2021) for learning such a dialog system, named FLONET, has two main limitations. (1) It uses a Retrieval Augmented Generation (RAG) framework which represents a flowchart as a bag of nodes. By doing so, it loses the connectivity structure between nodes that can aid in better response generation. (2) Typically dialogs progress with the agent asking polar (Y/N) questions, but users often respond indirectly without the explicit use of polar words. In such cases, it fails to understand the correct polarity of the answer. To overcome these issues, we propose Structure-Aware FLONET (SA-FLONET) which infuses structural constraints derived from the connectivity structure of flowcharts into the RAG framework. It uses natural language inference to better predict the polarity of indirect Y/N answers. We find that SA-FLONET outperforms FLONET, with a success rate improvement of 68% and 123% in flowchart grounded response generation and zero-shot flowchart grounded response generation tasks respectively.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [1.754991054534912, 9.100829124450684], "id": 4234, "title": "SLICER: Sliced Fine-Tuning for Low-Resource Cross-Lingual Transfer for Named Entity Recognition", "authors": "Fabian David Schmidt, Ivan Vuli\u0107 and Goran Glava\u0161", "abstract": "Large multilingual language models generally demonstrate impressive results in zero-shot cross-lingual transfer, yet often fail to successfully transfer to low-resource languages, even for token-level prediction tasks like named entity recognition (NER). In this work, we introduce a simple yet highly effective approach for improving zero-shot transfer for NER to low-resource languages. We observe that NER fine-tuning in the source language decontextualizes token representations, i.e., tokens increasingly attend to themselves. This increased reliance on token information itself, we hypothesize, triggers a type of overfitting to properties that NE tokens within the source languages share, but are generally not present in NE mentions of target languages. As a remedy, we propose a simple yet very effective sliced fine-tuning for NER (SLICER) that forces stronger token contextualization in the Transformer: we divide the transformed token representations and classifier into disjoint slices that are then independently classified during training. We evaluate SLICER on two standard benchmarks for NER that involve low-resource languages, WikiANN and MasakhaNER, and show that it (i) indeed reduces decontextualization (i.e., extent to which NE tokens attend to themselves), consequently (ii) yielding consistent transfer gains, especially prominent for low-resource target languages distant from the source language.", "track": "Multilinguality", "label": 13}, {"loc": [5.8835883140563965, 8.87955379486084], "id": 4260, "title": "EdgeFormer: A Parameter-Efficient Transformer for On-Device Seq2seq Generation", "authors": "Tao Ge, Si-Qing Chen and Furu Wei", "abstract": "We introduce EdgeFormer -- a parameter-efficient Transformer for on-device seq2seq generation under the strict computation and memory constraints. Compared with the previous parameter-efficient Transformers, EdgeFormer applies two novel principles for cost-effective parameterization, allowing it to perform better given the same parameter budget; moreover, EdgeFormer is further enhanced by layer adaptation innovation that is proposed for improving the network with shared layers.\n\nExtensive experiments show EdgeFormer can effectively outperform previous parameter-efficient Transformer baselines and achieve competitive results under both the computation and memory constraints. Given the promising results, we release EdgeLM -- the pretrained version of EdgeFormer, which is the first publicly available pretrained on-device seq2seq model that can be easily fine-tuned for seq2seq tasks with strong results, facilitating on-device seq2seq generation in practice.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [5.495899200439453, 12.272038459777832], "id": 4267, "title": "End-to-End Unsupervised Vision-and-Language Pre-training with Referring Expression Matching", "authors": "Chi Chen, Peng Li, Maosong Sun and Yang Liu", "abstract": "Recently there has been an emerging interest in unsupervised vision-and-language pre-training (VLP) that learns multimodal representations without parallel image-caption data. These pioneering works significantly reduce the cost of VLP on data collection and achieve promising results compared to supervised VLP. However, existing unsupervised VLP methods take as input pre-extracted region-based visual features from external object detectors, which both limits flexibility and reduces computational efficiency. In this paper, we explore end-to-end unsupervised VLP with a vision encoder to directly encode images. The vision encoder is pre-trained on image-only data and jointly optimized during multimodal pre-training. To further enhance the learned cross-modal features, we propose a novel pre-training task that predicts which patches contain an object referred to in natural language from the encoded visual features. Extensive experiments on four vision-and-language tasks show that our approach outperforms previous unsupervised VLP methods and obtains new state-of-the-art results.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [4.724325656890869, 3.404350757598877], "id": 4268, "title": "Faithful Knowledge Graph Explanations in Commonsense Question Answering", "authors": "Guy Aglionby and Simone Teufel", "abstract": "Knowledge graphs are commonly used as sources of information in commonsense question answering, and can also be used to express explanations for the model's answer choice. A common way of incorporating facts from the graph is to encode them separately from the question, and then combine the two representations to select an answer. In this paper, we argue that highly faithful graph-based explanations cannot be extracted from existing models of this type. Such explanations will not include reasoning done by the transformer encoding the question, so will be incomplete. We confirm this theory with a novel proxy measure for faithfulness and propose two architecture changes to address the problem. Our findings suggest a path forward for developing architectures for faithful graph-based explanations.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [7.5805559158325195, 12.31023120880127], "id": 4292, "title": "KOLD: Korean Offensive Language Dataset", "authors": "Younghoon Jeong, Juhyun Oh, Jongwon Lee, Jaimeen Ahn, Jihyung Moon, Sungjoon Park and Alice Oh", "abstract": "Recent directions for offensive language detection are hierarchical modeling, identifying the type and the target of offensive language, and interpretability with offensive span annotation and prediction. These improvements are focused on English and do not transfer well to other languages because of cultural and linguistic differences. In this paper, we present the Korean Offensive Language Dataset (KOLD) comprising 40,429 comments, which are annotated hierarchically with the type and the target of offensive language, accompanied by annotations of the corresponding text spans. We collect the comments from NAVER news and YouTube platform and provide the titles of the articles and videos as the context information for the annotation process. We use these annotated comments as training data for Korean BERT and RoBERTa models and find that they are effective at offensiveness detection, target classification, and target span detection while having room for improvement for target group classification and offensive span detection. We discover that the target group distribution differs drastically from the existing English datasets, and observe that providing the context information improves the model performance in offensiveness detection (+0.3), target classification (+1.5), and target group classification (+13.1). We publicly release the dataset and baseline models.", "track": "Resources and Evaluation", "label": 1}, {"loc": [6.035600185394287, 8.883742332458496], "id": 4294, "title": "Evade the Trap of Mediocrity: Promoting Diversity and Novelty in Text Generation via Concentrating Attention", "authors": "Wenhao Li, Xiaoyuan Yi, Jinyi Hu, Maosong Sun and Xing Xie", "abstract": "Recently, powerful Transformer architectures have proven superior in generating high-quality sentences. Nevertheless, these models tend to produce dull high-frequency phrases, severely hurting the diversity and novelty of generated text. In this work, we dig into the intrinsic mechanism of this problem and found that sparser attention values in Transformer could improve diversity. To understand such a phenomenon, we first conduct both empirical and theoretical analysis and then attribute it to representation degeneration caused by the attentive mixture of the hidden states during training. We term this process the Trap of Mediocrity. To escape from such a trap, we introduce a novel attention regularization loss to control the sharpness of the attention distribution, which is transparent to model structures and can be easily implemented within 20 lines of python code. We prove that this method could be mathematically regarded as learning a Bayesian approximation of posterior attention. Experiments show that our method improved the diversity and novelty of the generated text while maintaining comparable quality on a variety of conditional and unconditional generation tasks.", "track": "Natural Language Generation", "label": 6}, {"loc": [6.872811794281006, 6.209075927734375], "id": 4295, "title": "The better your Syntax, the better your Semantics? Probing Pretrained Language Models for the English Comparative Correlative", "authors": "Leonie Weissweiler, Valentin Hofmann, Abdullatif K\u00f6ksal and Hinrich Sch\u00fctze", "abstract": "Construction Grammar (CxG) is a paradigm from cognitive linguistics emphasising the connection between syntax and semantics. Rather than rules that operate on lexical items, it posits constructions as the central building blocks of language, i.e., linguistic units of different granularity that combine syntax and semantics. As a first step towards assessing the compatibility of CxG with the syntactic and semantic knowledge demonstrated by state-of-the-art pretrained language models (PLMs), we present an investigation of their capability to classify and understand one of the most commonly studied constructions, the English comparative correlative (CC). We conduct experiments examining the classification accuracy of a syntactic probe on the one hand and the models' behaviour in a semantic application task on the other, with BERT, RoBERTa, and DeBERTa as the example PLMs. Our results show that all three investigated PLMs are able to recognise the structure of the CC but fail to use its meaning. While human-like performance of PLMs on many NLP tasks has been alleged, this indicates that PLMs still suffer from substantial shortcomings in central domains of linguistic knowledge.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [4.528491497039795, 4.4971842765808105], "id": 4299, "title": "ProofInfer: Generating Proof via Iterative Hierarchical Inference", "authors": "Zichu Fei, Qi Zhang, Xin Zhou, Tao Gui and Xuanjing Huang", "abstract": "Proof generation focuses on deductive reasoning: given a hypothesis and a set of theories, including some supporting facts and logical rules expressed in natural language, the model generates a proof tree indicating how to deduce the hypothesis from given theories.\nCurrent models with state-of-the-art performance employ the stepwise method that adds an individual node to the proof step-by-step.\nHowever, these methods actually focus on generating several proof paths rather than a whole tree.\nDuring generation, they focus on the most relevant areas of the currently generated node while neglecting the rest of the proof tree. \nTo address this problem, we propose ProofInfer, which generates the proof tree via iterative hierarchical inference.\nAt each step, ProofInfer adds the entire layer to the proof, where all nodes in this layer are generated simultaneously. \nSince the conventional autoregressive generation architecture cannot simultaneously predict multiple nodes, ProofInfer employs text-to-text paradigm.\nTo this end, we propose a divide-and-conquer algorithm to encode the proof tree as the plain text without losing structure information.\nExperimental results show that ProofInfer significantly improves performance on several widely-used datasets.\nIn addition, ProofInfer still performs well with data-limited, achieving comparable performance to the state-of-the-art model with about 40% of the training data.", "track": "Natural Language Generation", "label": 6}, {"loc": [3.7223188877105713, 9.365842819213867], "id": 4300, "title": "ECTSum: A New Benchmark Dataset For Bullet Point Summarization of Long Earnings Call Transcripts", "authors": "Rajdeep Mukherjee, Abhinav Bohra, Akash Banerjee, Soumya Sharma, Manjunath Hegde, Afreen Shaikh, Shivani Shrivastava, Koustuv Dasgupta, Niloy Ganguly, Saptarshi Ghosh and Pawan Goyal", "abstract": "Despite tremendous progress in automatic summarization, state-of-the-art methods are predominantly trained to excel in summarizing short newswire articles, or documents with strong layout biases such as scientific articles or government reports. Efficient techniques to summarize financial documents, discussing facts and figures, have largely been unexplored, majorly due to the unavailability of suitable datasets. In this work, we present ECTSum, a new dataset with transcripts of earnings calls (ECTs), hosted by publicly traded companies, as documents, and experts-written short telegram-style bullet point summaries derived from corresponding Reuters articles. ECTs are long unstructured documents without any prescribed length limit or format. We benchmark our dataset with state-of-the-art summarization methods across various metrics evaluating the content quality and factual consistency of the generated summaries. Finally, we present a simple yet effective approach, ECT-BPS, to generate a set of bullet points that precisely capture the important facts discussed in the calls.", "track": "Resources and Evaluation", "label": 1}, {"loc": [7.340866565704346, 6.84141206741333], "id": 4348, "title": "Cross-domain Generalization for AMR Parsing", "authors": "Xuefeng Bai, Sen Yang, Leyang Cui, Linfeng Song and Yue Zhang", "abstract": "Abstract Meaning Representation (AMR) parsing aims to predict an AMR graph from textual input. \nRecently, there has been notable growth in AMR parsing performance. However, most existing work focuses on improving the performance in the specific domain, ignoring the potential domain dependence of AMR parsing systems. To address this, we extensively evaluate five representative AMR parsers on five domains and analyze challenges to cross-domain AMR parsing. We observe that challenges to cross-domain AMR parsing mainly arise from the distribution shift of words and AMR concepts. Based on our observation, we investigate two approaches to reduce the domain distribution divergence of text and AMR features, respectively. Experimental results on two out-of-domain test sets show the superiority of our method.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [3.598658561706543, 9.731589317321777], "id": 4352, "title": "CiteSum: Citation Text-guided Scientific Extreme Summarization and Domain Adaptation with Limited Supervision", "authors": "Yuning Mao, Ming Zhong and Jiawei Han", "abstract": "Scientific extreme summarization (TLDR) aims to form ultra-short summaries of scientific papers. Previous efforts on curating scientific TLDR datasets failed to scale up due to the heavy human annotation and domain expertise required. In this paper, we propose a simple yet effective approach to automatically extracting TLDR summaries for scientific papers from their citation texts. Based on the proposed approach, we create a new benchmark CiteSum without human annotation, which is around 30 times larger than the previous human-curated dataset SciTLDR. We conduct a comprehensive analysis of CiteSum, examining its data characteristics and establishing strong baselines. We further demonstrate the usefulness of CiteSum by adapting models pre-trained on CiteSum (named CITES) to new tasks and domains with limited supervision. For scientific extreme summarization, CITES outperforms most fully-supervised methods on SciTLDR without any fine-tuning and obtains state-of-the-art results with only 128 examples. For news extreme summarization, CITES achieves significant gains on XSum over its base model (not pre-trained on CiteSum), e.g., +7.2 ROUGE-1 zero-shot performance and state-of-the-art few-shot performance. For news headline generation, CITES performs the best among unsupervised and zero-shot methods on Gigaword.", "track": "Summarization", "label": 14}, {"loc": [4.251677989959717, 7.323694229125977], "id": 4356, "title": "FETA: A Benchmark for Few-Sample Task Transfer in Open-Domain Dialogue", "authors": "Alon Albalak, Yi-Lin Tuan, Pegah Jandaghi, Connor Pryor, Luke Yoffe, Deepak Ramachandran, Lise Getoor, Jay Pujara and William Yang Wang", "abstract": "Task transfer, transferring knowledge contained in related tasks, holds the promise of reducing the quantity of labeled data required to fine-tune language models. Dialogue understanding encompasses many diverse tasks, yet task transfer has not been thoroughly studied in conversational AI. This work explores conversational task transfer by introducing FETA: a benchmark for FEw-sample TAsk transfer in open-domain dialogue.\nFETA contains two underlying sets of conversations upon which there are 10 and 7 tasks annotated, enabling the study of intra-dataset task transfer; task transfer without domain adaptation. We utilize three popular language models and three learning algorithms to analyze the transferability between 132 source-target task pairs and create a baseline for future work.\nWe run experiments in the single- and multi-source settings and report valuable findings, e.g., most performance trends are model-specific, and span extraction and multiple-choice tasks benefit the most from task transfer.\nIn addition to task transfer, FETA can be a valuable resource for future research into the efficiency and generalizability of pre-training datasets and model architectures, as well as for learning settings such as continual and multitask learning.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [4.882094860076904, 3.3823673725128174], "id": 4357, "title": "Do Children Texts Hold The Key To Commonsense Knowledge?", "authors": "Julien Romero and Simon Razniewski", "abstract": "Compiling comprehensive repositories of commonsense knowledge is a long-standing problem in AI. Many concerns revolve around the issue of reporting bias, i.e., that frequency in text sources is not a good proxy for relevance or truth. This paper explores whether children's texts hold the key to commonsense knowledge compilation, based on the hypothesis that such content makes fewer assumptions on the reader's knowledge, and therefore spells out commonsense more explicitly. An analysis with several corpora shows that children's texts indeed contain much more, and more typical commonsense assertions. Moreover, experiments show that this advantage can be leveraged in popular language-model-based commonsense knowledge extraction settings, where task-unspecific fine-tuning on small amounts of children texts (childBERT) already yields significant improvements. This provides a refreshing perspective different from the common trend of deriving progress from ever larger models and corpora.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [5.56419038772583, 8.528447151184082], "id": 4360, "title": "On the Limitations of Reference-Free Evaluations of Generated Text", "authors": "Daniel Deutsch, Rotem Dror and Dan Roth", "abstract": "There is significant interest in developing evaluation metrics which accurately estimate the quality of generated text without the aid of a human-written reference text, which can be time consuming and expensive to collect or entirely unavailable in online applications. However, in this work, we demonstrate that these reference-free metrics are inherently biased and limited in their ability to evaluate generated text, and we argue that they should not be used to measure progress on tasks like machine translation or summarization. We show how reference-free metrics are equivalent to using one generation model to evaluate another, which has several limitations: (1) the metrics can be optimized at test time to find the approximate best-possible output, (2) they are inherently biased toward models which are more similar to their own, and (3) they can be biased against higher-quality outputs, including those written by humans. Therefore, we recommend that reference-free metrics should be used as diagnostic tools for analyzing and understanding model behavior instead of measures of how well models perform a task, in which the goal is to achieve as high of a score as possible.", "track": "Resources and Evaluation", "label": 1}, {"loc": [10.788838386535645, 6.927899360656738], "id": 4363, "title": "Sampling-Based Approximations to Minimum Bayes Risk Decoding for Neural Machine Translation", "authors": "Bryan Eikema and Wilker Aziz", "abstract": "In NMT we search for the mode of the model distribution to form predictions. The mode and other high-probability translations found by beam search have been shown to often be inadequate in a number of ways. This prevents improving translation quality through better search, as these idiosyncratic translations end up selected by the decoding algorithm, a problem known as the beam search curse. Recently, an approximation to minimum Bayes risk (MBR) decoding has been proposed as an alternative decision rule that would likely not suffer from the same problems. We analyse this approximation and establish that it has no equivalent to the beam search curse. We then design approximations that decouple the cost of exploration from the cost of robust estimation of expected utility. This allows for much larger hypothesis spaces, which we show to be beneficial. We also show that mode-seeking strategies can aid in constructing compact sets of promising hypotheses and that MBR is effective in identifying good translations in them. We conduct experiments on three language pairs varying in amounts of resources available: English into and from German, Romanian, and Nepali.", "track": "Machine Translation", "label": 10}, {"loc": [9.128528594970703, 6.429752826690674], "id": 4369, "title": "IndicXNLI: Evaluating Multilingual Inference for Indian Languages", "authors": "Divyanshu Aggarwal, Vivek Gupta and Anoop Kunchukuttan", "abstract": "While Indic NLP has made rapid advances recently in terms of the availability of corpora and pre-trained models, benchmark datasets on standard NLU tasks are limited. To this end, we introduce INDICXNLI, an NLI dataset for 11 Indic languages. It has been created by high-quality machine translation of the original English XNLI dataset and our analysis attests to the quality of INDICXNLI. By finetuning different pre-trained LMs on this INDICXNLI, we analyze various cross-lingual transfer techniques with respect to the impact of the choice of language models, languages, multi-linguality, mix-language input, etc. These experiments provide us with useful insights into the behaviour of pre-trained models for a diverse set of languages.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [8.537271499633789, 8.126812934875488], "id": 4374, "title": "Model Cascading: Towards Jointly Improving Efficiency and Accuracy of NLP Systems", "authors": "Neeraj Varshney and Chitta Baral", "abstract": "Do all instances need inference through the big models for a correct prediction? Perhaps not; some instances are easy and can be answered correctly by even small capacity models. This provides opportunities for improving the computational efficiency of systems. In this work, we present an explorative study on 'model cascading', a simple technique that utilizes a collection of models of varying capacities to accurately yet efficiently output predictions. Through comprehensive experiments in multiple task settings that differ in the number of models available for cascading (K value), we show that cascading improves both the computational efficiency and the prediction accuracy. For instance, in K=3 setting, cascading saves up to 88.93% computation cost and consistently achieves superior prediction accuracy with an improvement of up to 2.18%. We also study the impact of introducing additional models in the cascade and show that it further increases the efficiency improvements. Finally, we hope that our work will facilitate development of efficient NLP systems making their widespread adoption in real-world applications possible.", "track": "Efficient Methods for NLP", "label": 12}, {"loc": [1.0585951805114746, 10.530745506286621], "id": 4377, "title": "Semantic Simplification for Sentiment Classification", "authors": "Xiaotong Jiang, Zhongqing Wang and Guodong Zhou", "abstract": "Recent work on document-level sentiment classification has shown that the sentiment in the original text is often hard to capture, since the sentiment is usually either expressed implicitly or shifted due to the occurrences of negation and rhetorical words. To this end, we enhance the original text with a sentiment-driven simplified clause to intensify its sentiment. The simplified clause shares the same opinion with the original text but expresses the opinion much more simply. Meanwhile, we employ Abstract Meaning Representation (AMR) for generating simplified clauses, since AMR explicitly provides core semantic knowledge, and potentially offers core concepts and explicit structures of original texts. Empirical studies show the effectiveness of our proposed model over several strong baselines. The results also indicate the importance of simplified clauses for sentiment classification.", "track": "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "label": 16}, {"loc": [8.60241413116455, 8.419034957885742], "id": 4380, "title": "XPrompt: Exploring the Extreme of Prompt Tuning", "authors": "Fang Ma, Chen Zhang, Lei Ren, Jingang Wang, Qifan Wang, Wei Wu, Xiaojun Quan and Dawei Song", "abstract": "Prompt tuning learns soft prompts to condition the frozen Pre-trained Language Models (PLMs) for performing downstream tasks in a parameter-efficient manner. While prompt tuning has gradually reached the performance level of fine-tuning as the model scale increases, there is still a large performance gap between prompt tuning and fine-tuning for models of moderate and small scales (typically less than 11B parameters). In this paper, we empirically show that the trained prompt tokens can have a negative impact on a downstream task and thus degrade its performance. To bridge the gap, we propose a novel Prompt tuning model with an eXtremely small scale (XPrompt) under the regime of lottery tickets hypothesis. Specifically, XPrompt eliminates the negative prompt tokens at different granularity levels through a hierarchical structured pruning, yielding a more parameter-efficient prompt yet with a competitive performance. Comprehensive experiments are carried out on the SuperGLUE tasks, and the results indicate that XPrompt is able to close the performance gap at smaller model scales.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [7.549868106842041, 9.047859191894531], "id": 4382, "title": "Rethinking the Role of Demonstrations: What Makes In-Context Learning Work?", "authors": "Sewon Min, Xinxi Lyu, Ari Holtzman, Mikel Artetxe, Mike Lewis, Hannaneh Hajishirzi and Luke Zettlemoyer", "abstract": "Large language models (LMs) are able to in-context learn\u2014perform a new task via inference alone by conditioning on a few input-label pairs (demonstrations) and making predictions for new inputs. However, there has been little understanding of how the model learns and which aspects of the demonstrations contribute to end task performance. In this paper, we show that ground truth demonstrations are in fact not required\u2014randomly replacing labels in the demonstrations barely hurts performance on a range of classification and multi-choce tasks, consistently over 12 different models including GPT-3. Instead, we find that other aspects of the demonstrations are the key drivers of end\ntask performance, including the fact that they provide a few examples of (1) the label space, (2) the distribution of the input text, and (3) the overall format of the sequence. Together, our analysis provides a new way of understanding how and why in-context learning works, while opening up new questions about how much can be learned from large language models through inference alone.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [6.823146343231201, 6.184777736663818], "id": 4384, "title": "The Curious Case of Control", "authors": "Elias Stengel-Eskin and Benjamin Van Durme", "abstract": "Children acquiring English make systematic errors on subject control sentences even after they have reached near-adult competence (Chomsky, 1969), possibly due to heuristics based on semantic roles (Maratsos, 1974).\nGiven the advanced fluency of large generative language models, we ask whether model outputs are consistent with these heuristics, and to what degree different models are consistent with each other. We find that models can be categorized by behavior into three separate groups, with broad differences between the groups. The outputs of models in the largest group are consistent with positional heuristics that succeed on subject control but fail on object control. This result is surprising, given that object control is orders of magnitude more frequent in the text data used to train such models. We examine to what degree the models are sensitive to prompting with agent-patient information, finding that raising the salience of agent and patient relations results in significant changes in the outputs of most models. Based on this observation, we leverage an existing dataset of semantic proto-role annotations (White et al. 2020) to explore the connections between control and labeling event participants with properties typically associated with agents and patients.", "track": "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "label": 22}, {"loc": [5.572396278381348, 8.836969375610352], "id": 4385, "title": "SHARE: a System for Hierarchical Assistive Recipe Editing", "authors": "Shuyang Li, Yufei Li, Jianmo Ni and Julian McAuley", "abstract": "The large population of home cooks with dietary restrictions is under-served by existing cooking resources and recipe generation models. To help them, we propose the task of controllable recipe editing: adapt a base recipe to satisfy a user-specified dietary constraint. This task is challenging, and cannot be adequately solved with human-written ingredient substitution rules or existing end-to-end recipe generation models. We tackle this problem with SHARE: a System for Hierarchical Assistive Recipe Editing, which performs simultaneous ingredient substitution before generating natural-language steps using the edited ingredients. By decoupling ingredient and step editing, our step generator can explicitly integrate the available ingredients. Experiments on the novel RecipePairs dataset---83K pairs of similar recipes where each recipe satisfies one of seven dietary constraints---demonstrate that SHARE produces convincing, coherent recipes that are appropriate for a target dietary constraint. We further show through human evaluations and real-world cooking trials that recipes edited by SHARE can be easily followed by home cooks to create appealing dishes.", "track": "NLP Applications", "label": 0}, {"loc": [4.349253177642822, 7.272778034210205], "id": 4392, "title": "IM^2: an Interpretable and Multi-category Integrated Metric Framework for Automatic Dialogue Evaluation", "authors": "Zhihua Jiang, Guanghui Ye, Dongning Rao, Di Wang and Xin Miao", "abstract": "Evaluation metrics shine the light on the best models and thus strongly influence the research directions, such as the recently developed dialogue metrics USR, FED, and GRADE. However, most current metrics evaluate the dialogue data as isolated and static because they only focus on a single quality or several qualities. To mitigate the problem, this paper proposes an interpretable, multi-faceted, and controllable framework IM^2 (Interpretable and Multi-category Integrated Metric) to combine a large number of metrics which are good at measuring different qualities. The IM^2 framework first divides current popular dialogue qualities into different categories and then applies or proposes dialogue metrics to measure the qualities within each category and finally generates an overall IM^2 score. An initial version of IM^2 was submitted to the AAAI 2022 Track5.1@DSTC10 challenge and took the 2^nd place on both of the development and test leaderboard. After the competition, we develop more metrics and improve the performance of our model. We compare IM^2 with other 13 current dialogue metrics and experimental results show that IM^2 correlates more strongly with human judgments than any of them on each evaluated dataset.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [5.348541259765625, 12.146551132202148], "id": 4394, "title": "PEVL: Position-enhanced Pre-training and Prompt Tuning for Vision-language Models", "authors": "Yuan Yao, qianyu chen, Ao Zhang, Wei Ji, Zhiyuan Liu, Tat-Seng Chua and Maosong Sun", "abstract": "Vision-language pre-training (VLP) has shown impressive performance on a wide range of cross-modal tasks, where VLP models without reliance on object detectors are becoming the mainstream due to their superior computation efficiency and competitive performance. However, the removal of object detectors also deprives the capability of VLP models in explicit object modeling, which is essential to various position-sensitive vision-language (VL) tasks, such as referring expression comprehension and visual commonsense reasoning. To address the challenge, we introduce PEVL that enhances the pre-training and prompt tuning of VLP models with explicit object position modeling. Specifically, PEVL reformulates discretized object positions and language in a unified language modeling framework, which facilitates explicit VL alignment during pre-training, and also enables flexible prompt tuning for various downstream tasks. We show that PEVL enables state-of-the-art performance of detector-free VLP models on position-sensitive tasks such as referring expression comprehension and phrase grounding, and also improves the performance on position-insensitive tasks with grounded inputs. We make the data and code for this paper publicly available at https://github.com/thunlp/PEVL.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [5.0212883949279785, 3.7368695735931396], "id": 4396, "title": "Pre-training Language Models with Deterministic Factual Knowledge", "authors": "Shaobo Li, Xiaoguang Li, Lifeng Shang, Chengjie Sun, Bingquan Liu, zhenzhou Ji, Xin Jiang and Qun Liu", "abstract": "Previous works show that Pre-trained Language Models (PLMs) can capture factual knowledge. However, some analyses reveal that PLMs fail to perform it robustly, e.g., being sensitive to the changes of prompts when extracting factual knowledge. To mitigate this issue, we propose to let PLMs learn the deterministic relationship between the remaining context and the masked content. The deterministic relationship ensures that the masked factual content can be deterministically inferable based on the existing clues in the context. That would provide more stable patterns for PLMs to capture factual knowledge than randomly masking. Two pre-training tasks are further introduced to motivate PLMs to rely on the deterministic relationship when filling masks. Specifically, we use an external Knowledge Base (KB) to identify deterministic relationships and continuously pre-train PLMs with the proposed methods. The factual knowledge probing experiments indicate that the continuously pre-trained PLMs achieve better robustness in factual knowledge capturing. Further experiments on question-answering datasets show that trying to learn a deterministic relationship with the proposed methods can also help other knowledge-intensive tasks.", "track": "Question Answering", "label": 11}, {"loc": [8.280447006225586, 7.92151403427124], "id": 4397, "title": "Finding Skill Neurons in Pre-trained Transformer-based Language Models", "authors": "Xiaozhi Wang, Kaiyue Wen, Zhengyan Zhang, Lei Hou, Zhiyuan Liu and Juanzi Li", "abstract": "Transformer-based pre-trained language models have demonstrated superior performance on various natural language processing tasks. However, it remains unclear how the skills required to handle these tasks distribute among model parameters. In this paper, we find that after prompt tuning for specific tasks, the activations of some neurons within pre-trained Transformers are highly predictive of the task labels. We dub these neurons skill neurons and confirm they encode task-specific skills by finding that: (1) Skill neurons are crucial for handling tasks. Performances of pre-trained Transformers on a task significantly drop when corresponding skill neurons are perturbed. (2) Skill neurons are task-specific. Similar tasks tend to have similar distributions of skill neurons. Furthermore, we demonstrate the skill neurons are most likely generated in pre-training rather than fine-tuning by showing that the skill neurons found with prompt tuning are also crucial for other fine-tuning methods freezing neuron weights, such as the adapter-based tuning and BitFit. We also explore the applications of skill neurons, including accelerating Transformers with network pruning and building better transferability indicators. These findings may promote further research on understanding Transformers. The source code can be obtained from https://github.com/THU-KEG/Skill-Neuron.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [7.779642105102539, 8.905840873718262], "id": 4398, "title": "Prompt Conditioned VAE: Enhancing Generative Replay for Lifelong Learning in Task-Oriented Dialogue", "authors": "Yingxiu Zhao, Yinhe Zheng, Zhiliang Tian, Chang Gao, Jian Sun and Nevin L. Zhang", "abstract": "Lifelong learning (LL) is vital for advanced task-oriented dialogue (ToD) systems. To address the catastrophic forgetting issue of LL, generative replay methods are widely employed to consolidate past knowledge with generated pseudo samples. However, most existing generative replay methods use only a single task-specific token to control their models. This scheme is usually not strong enough to constrain the generative model due to insufficient information involved. In this paper, we propose a novel method, prompt conditioned VAE for lifelong learning (PCLL), to enhance generative replay by incorporating tasks' statistics. PCLL captures task-specific distributions with a conditional variational autoencoder, conditioned on natural language prompts to guide the pseudo-sample generation. Moreover, it leverages a distillation process to further consolidate past knowledge by alleviating the noise in pseudo samples. Experiments on natural language understanding tasks of ToD systems demonstrate that PCLL significantly outperforms competitive baselines in building lifelong learning models.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [10.87362289428711, 6.844954967498779], "id": 4401, "title": "PreQuEL: Quality Estimation of Machine Translation Outputs in Advance", "authors": "Shachar Don-Yehiya, Leshem Choshen and Omri Abend", "abstract": "We present the task of PreQuEL, Pre-(Quality-Estimation) Learning. A PreQuEL system predicts how well a given sentence will be translated, without recourse to the actual translation, thus eschewing unnecessary resource allocation when translation quality is bound to be low. \nPreQuEL can be defined relative to a given MT system (e.g., some industry service) or generally relative to the state-of-the-art.\nFrom a theoretical perspective, PreQuEL places the focus on the source text, tracing properties, possibly linguistic features, that make a sentence harder to machine translate.\n\nWe develop a baseline model for the task and analyze its performance. We also develop a data augmentation method (from parallel corpora), that improves results substantially. We show that this augmentation method can improve the performance of the Quality-Estimation task as well.\nWe investigate the properties of the input text that our model is sensitive to, by testing it on challenge sets and different languages. We conclude that it is aware of syntactic and semantic distinctions, and correlates and even over-emphasizes the importance of standard NLP features.", "track": "Machine Translation", "label": 10}, {"loc": [4.525197982788086, 4.387191295623779], "id": 4407, "title": "Can Transformers Reason in Fragments of Natural Language?", "authors": "Viktor Schlegel, Kamen V. Pavlov and Ian Pratt-Hartmann", "abstract": "State-of-the-art deep-learning-based approaches to Natural Language Processing (NLP) are credited with various capabilities that involve reasoning with natural language texts. %However, reasoning in this setting is often ill-defined and shallow. \nIn this paper we carry out a large-scale empirical study investigating the detection of formally valid inferences in controlled fragments of natural language for which the satisfiability problem becomes increasingly complex. We find that, while transformer-based language models perform surprisingly well in these scenarios, a deeper analysis reveals that they appear to overfit to superficial patterns in the data rather than acquiring the logical principles governing the reasoning in these fragments.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [9.875997543334961, 7.881039142608643], "id": 4408, "title": "Textless Speech Emotion Conversion using Discrete & Decomposed Representations", "authors": "Felix Kreuk, Adam Polyak, Jade Copet, Eugene Kharitonov, Tu Anh Nguyen, Morgan Rivi\u00e8re, Wei-Ning Hsu, Abdelrahman Mohamed, Emmanuel Dupoux and Yossi Adi", "abstract": "Speech emotion conversion is the task of modifying the perceived emotion of a speech utterance while preserving the lexical content and speaker identity. In this study, we cast the problem of emotion conversion as a spoken language translation task. We use a decomposition of the speech signal into discrete learned representations, consisting of phonetic-content units, prosodic features, speaker, and emotion. First, we modify the speech content by translating the phonetic-content units to a target emotion, and then predict the prosodic features based on these units. Finally, the speech waveform is generated by feeding the predicted representations into a neural vocoder. Such a paradigm allows us to go beyond spectral and parametric changes of the signal, and model non-verbal vocalizations, such as laughter insertion, yawning removal, etc. We demonstrate objectively and subjectively that the proposed method is vastly superior to current approaches and even beats text-based systems in terms of perceived emotion and audio quality. We rigorously evaluate all components of such a complex system and conclude with an extensive model analysis and ablation study to better emphasize the architectural choices, strengths and weaknesses of the proposed method. Samples are available under the following link: https://speechbot.github.io/emotion", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [7.917130470275879, 3.2991676330566406], "id": 4409, "title": "Textual Backdoor Attacks Can Be More Harmful via Two Simple Tricks", "authors": "Yangyi Chen, Fanchao Qi, Hongcheng Gao, Zhiyuan Liu and Maosong Sun", "abstract": "Backdoor attacks are a kind of emergent security threat in deep learning. After being injected with a backdoor, a deep neural model will behave normally on standard inputs but give adversary-specified predictions once the input contains specific backdoor triggers. In this paper, we find two simple tricks that can make existing textual backdoor attacks much more harmful. The first trick is to add an extra training task to distinguish poisoned and clean data during the training of the victim model, and the second one is to use all the clean training data rather than remove the original clean data corresponding to the poisoned data. These two tricks are universally applicable to different attack models. We conduct experiments in three tough situations including clean data fine-tuning, low-poisoning-rate, and label-consistent attacks. Experimental results show that the two tricks can significantly improve attack performance. This paper exhibits the great potential harmfulness of backdoor attacks. All the code and data can be obtained at \\url{https://github.com/thunlp/StyleAttack}.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [7.623270034790039, 3.6370835304260254], "id": 4413, "title": "Why Should Adversarial Perturbations be Imperceptible? Rethink the Research Paradigm in Adversarial NLP", "authors": "Yangyi Chen, Hongcheng Gao, Ganqu CUI, Fanchao Qi, Longtao Huang, Zhiyuan Liu and Maosong Sun", "abstract": "Textual adversarial samples play important roles in multiple subfields of NLP research, including security, evaluation, explainability, and data augmentation. However, most work mixes all these roles, obscuring the problem definitions and research goals of the security role that aims to reveal the practical concerns of NLP models. In this paper, we rethink the research paradigm of textual adversarial samples in security scenarios. We discuss the deficiencies in previous work and propose our suggestions that the research on the Security-oriented adversarial NLP (SoadNLP) should: (1) evaluate their methods on security tasks to demonstrate the real-world concerns; (2) consider real-world attackers' goals, instead of developing impractical methods. To this end, we first collect, process, and release a security datasets collection Advbench. Then, we reformalize the task and adjust the emphasis on different goals in SoadNLP. Next, we propose a simple method based on heuristic rules that can easily fulfill the actual adversarial goals to simulate real-world attack methods. We conduct experiments on both the attack and the defense sides on Advbench. Experimental results show that our method has higher practical value, indicating that the research paradigm in SoadNLP may start from our new benchmark. All the code and data of Advbench can be obtained at \\url{https://github.com/thunlp/Advbench}.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.42486572265625, 12.172715187072754], "id": 4414, "title": "Retrieval Augmented Visual Question Answering with Outside Knowledge", "authors": "Weizhe Lin and Bill Byrne", "abstract": "Outside-Knowledge Visual Question Answering (OK-VQA) is a challenging VQA task that requires retrieval of external knowledge to answer questions about images. Recent OK-VQA systems use Dense Passage Retrieval (DPR) to retrieve documents from external knowledge bases, such as Wikipedia, but with DPR trained separately from answer generation, introducing a potential limit on the overall system performance.\nInstead, we propose a joint training scheme which includes differentiable DPR integrated with answer generation so that the system can be trained in an end-to-end fashion. Our experiments show that our scheme outperforms recent OK-VQA systems with strong DPR for retrieval. We also introduce new diagnostic metrics to analyze how retrieval and generation interact. The strong retrieval ability of our model significantly reduces the number of retrieved documents needed in training, yielding significant benefits in answer quality and computation required for training.", "track": "Speech, Vision, Robotics, Multimodal Grounding", "label": 7}, {"loc": [7.232288837432861, 7.949219226837158], "id": 4415, "title": "Instance Regularization for Discriminative Language Model Pre-training", "authors": "Zhuosheng Zhang, Hai Zhao and Ming Zhou", "abstract": "Discriminative pre-trained language models (PrLMs) can be generalized as denoising auto-encoders that work with two procedures, ennoising and denoising. First, an ennoising process corrupts texts with arbitrary noising functions to construct training instances. Then, a denoising language model is trained to restore the corrupted tokens. Existing studies have made progress by optimizing independent strategies of either ennoising or denosing. They treat training instances equally throughout the training process, with little attention on the individual contribution of those instances. To model explicit signals of instance contribution, this work proposes to estimate the complexity of restoring the original sentences from corrupted ones in language model pre-training. The estimations involve the corruption degree in the ennoising data construction process and the prediction confidence in the denoising counterpart. Experimental results on natural language understanding and reading comprehension benchmarks show that our approach improves pre-training efficiency, effectiveness, and robustness. Code is publicly available at https://github.com/cooelf/InstanceReg.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [10.833569526672363, 6.820982456207275], "id": 4416, "title": "GuoFeng: A Benchmark for Zero Pronoun Recovery and Translation", "authors": "Mingzhou Xu, Longyue Wang, Derek F. Wong, Hongye Liu, Linfeng Song, Lidia S. Chao, Shuming Shi and Zhaopeng Tu", "abstract": "The phenomenon of zero pronoun (ZP) has attracted increasing interest in the machine translation (MT) community due to its importance and difficulty. However, previous studies generally evaluate the quality of translating ZPs with BLEU scores on MT testsets, which is not expressive or sensitive enough for accurate assessment. To bridge the data and evaluation gaps, we propose a benchmark testset for target evaluation on Chinese-English ZP translation. The human-annotated testset covers five challenging genres, which reveal different characteristics of ZPs for comprehensive evaluation. We systematically revisit eight advanced models on ZP translation and identify current challenges for future exploration. We release data, code, models and annotation guidelines, which we hope can significantly promote research in this field (https://github.com/longyuewangdcu/mZPRT).", "track": "Resources and Evaluation", "label": 1}, {"loc": [3.8936052322387695, 4.447811603546143], "id": 4418, "title": "ScienceWorld: Is your Agent Smarter than a 5th Grader?", "authors": "Ruoyao Wang, Peter Jansen, Marc-Alexandre C\u00f4t\u00e9 and Prithviraj Ammanabrolu", "abstract": "We present ScienceWorld, a benchmark to test agents' scientific reasoning abilities in a new interactive text environment at the level of a standard elementary school science curriculum. Despite the transformer-based progress seen in question-answering and scientific text processing, we find that current models cannot reason about or explain learned science concepts in novel contexts. For instance, models can easily answer what the conductivity of a known material is but struggle when asked how they would conduct an experiment in a grounded environment to find the conductivity of an unknown material. This begs the question of whether current models are simply retrieving answers by way of seeing a large number of similar examples or if they have learned to reason about concepts in a reusable manner. We hypothesize that agents need to be grounded in interactive environments to achieve such reasoning capabilities. Our experiments provide empirical evidence supporting this hypothesis -- showing that a 1.5 million parameter agent trained interactively for 100k steps outperforms a 11 billion parameter model statically trained for scientific question-answering and reasoning from millions of expert demonstrations.", "track": "NLP Applications", "label": 0}, {"loc": [6.53921365737915, 7.393195629119873], "id": 4421, "title": "Improving Embeddings Representations for Comparing Higher Education Curricula: A Use Case in Computing", "authors": "Jeffri Murrugarra-Llerena, Fernando Alva-Manchego and Nils Murrugarra-LLerena", "abstract": "We propose an approach for comparing curricula of study programs in higher education. Pre-trained word embeddings are fine-tuned in a study program classification task, where each curriculum is represented by the names and content of its courses. By combining metric learning with a novel course-guided attention mechanism, our method obtains more accurate curriculum representations than strong baselines. Experiments on a new dataset with curricula of computing programs demonstrate the intuitive power of our approach via attention weights, topic modeling, and embeddings visualizations. We also present a use case comparing computing curricula from USA and Latin America to showcase the capabilities of our improved embeddings representations.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [5.952273845672607, 5.208144187927246], "id": 4425, "title": "Mitigating Spurious Correlation in Natural Language Understanding with Counterfactual Inference", "authors": "Can Udomcharoenchaikit, Wuttikorn Ponwitayarat, Patomporn Payoungkhamdee, Kanruethai Masuk, Weerayut Buaphet, Ekapol Chuangsuwanich and Sarana Nutanong", "abstract": "Despite their promising results on standard benchmarks, NLU models are still prone to make predictions based on shortcuts caused by unintended bias in the dataset. For example, an NLI model may use lexical overlap as a shortcut to make entailment predictions due to repetitive data generation patterns from annotators, also called annotation artifacts. In this paper, we propose a causal analysis framework to help debias NLU models. We show that (1) by defining causal relationships, we can introspect how much annotation artifacts affect the outcomes. (2) We can utilize counterfactual inference to mitigate bias with this knowledge. We found that viewing a model as a treatment can mitigate bias more effectively than viewing annotation artifacts as treatment. (3) In addition to bias mitigation, we can interpret how much each debiasing strategy is affected by annotation artifacts. Our experimental results show that using counterfactual inference can improve out-of-distribution performance in all settings while maintaining high in-distribution performance.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [6.557802200317383, 12.023564338684082], "id": 4430, "title": "Federated Meta-Learning for Emotion and Sentiment Aware Multi-modal Complaint Identification", "authors": "Apoorva Singh, C Siddarth, Sriparna Saha and Tanmay Sen", "abstract": "", "track": "NLP Applications", "label": 0}, {"loc": [3.3298754692077637, 7.8765997886657715], "id": 4431, "title": "End-to-End Neural Discourse Deixis Resolution in Dialogue", "authors": "Shengjie Li and Vincent Ng", "abstract": "We adapt Lee et al.'s (2018) span-based entity coreference model to the task of end-to-end discourse deixis resolution in dialogue, specifically by proposing extensions to their model that exploit task-specific characteristics. The resulting model, dd-utt, achieves state-of-the-art results on the four datasets in the CODI-CRAC 2021 shared task.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [6.224602222442627, 5.552917957305908], "id": 4442, "title": "Balancing out Bias: Achieving Fairness Through Balanced Training", "authors": "Xudong Han, Timothy Baldwin and Trevor Cohn", "abstract": "Group bias in natural language processing tasks manifests as disparities in system error rates across texts authorized by different demographic groups, typically disadvantaging minority groups. Dataset balancing has been shown to be effective at mitigating bias, however existing approaches do not directly account for correlations between author demographics and linguistic variables, limiting their effectiveness. To achieve Equal Opportunity fairness, such as equal job opportunity without regard to demographics, this paper introduces a simple, but highly effective, objective for countering bias using balanced training.\nWe extend the method in the form of a gated model, which incorporates protected attributes as input, and show that it is effective at reducing bias in predictions through demographic input perturbation, outperforming all other bias mitigation techniques when combined with balanced training.", "track": "Ethics", "label": 21}, {"loc": [8.093547821044922, 9.465882301330566], "id": 4449, "title": "Prompting ELECTRA: Few-Shot Learning with Discriminative Pre-Trained Models", "authors": "Mengzhou Xia, Mikel Artetxe, Jingfei Du, Danqi Chen and Veselin Stoyanov", "abstract": "Pre-trained masked language models successfully perform few-shot learning by formulating downstream tasks as text infilling. How- ever, as a strong alternative in full-shot settings, discriminative pre-trained models like ELECTRA do not fit into the paradigm. In this work, we adapt prompt-based few-shot learning to ELECTRA and show that it outperforms masked language models in a wide range of tasks. ELECTRA is pre-trained to distinguish if a token is generated or original. We naturally extend that to prompt-based few-shot learning by training to score the originality of the target options without introducing new parameters. Our method can be easily adapted to tasks involving multi-token predictions without extra computation overhead. Analysis shows that ELECTRA learns distributions that align better with downstream tasks.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [4.948933124542236, 3.554842472076416], "id": 4450, "title": "Identifying Physical Object Use in Sentences", "authors": "Tianyu Jiang and Ellen Riloff", "abstract": "Commonsense knowledge about the typical\nfunctions of physical objects allows people to\nmake inferences during sentence understanding.\nFor example, we infer that \"Sam enjoyed\nthe book\" means that Sam enjoyed reading the\nbook, even though the action is implicit. Prior\nresearch has focused on learning the prototypical\nfunctions of physical objects in order to\nenable inferences about implicit actions. But\nmany sentences refer to objects even when they\nare not used (e.g., \"The book fell\"). We argue\nthat NLP systems need to recognize whether an\nobject is being used before inferring how the\nobject is used. We define a new task called Object\nUse Classification that determines whether\na physical object mentioned in a sentence was\nused or likely will be used. We introduce a new\ndataset for this task and present a classification\nmodel that exploits data augmentation methods\nand FrameNet when fine-tuning a pre-trained\nlanguage model. We also show that object use\nclassification combined with knowledge about\nthe prototypical functions of objects has the\npotential to yield very good inferences about\nimplicit and anticipated actions.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [4.290769100189209, 7.503115177154541], "id": 4452, "title": "CDialog: A Multi-turn Covid-19 Conversation Dataset for Entity-Aware Dialog Generation", "authors": "Deeksha Varshney, Aizan Zafar, Niranshu Kumar Behera and Asif Ekbal", "abstract": "", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [5.941260814666748, 5.118440628051758], "id": 4454, "title": "Robustifying Sentiment Classification by Maximally Exploiting Few Counterfactuals", "authors": "Maarten De Raedt, Fr\u00e9deric Godin, Chris Develder and Thomas Demeester", "abstract": "For text classification tasks, finetuned language models perform remarkably well. Yet, they tend to rely on spurious patterns in training data, thus limiting their performance on out-of-distribution (OOD) test data. Among recent models aiming to avoid this spurious pattern problem, adding extra counterfactual samples to the training data has proven to be very effective. Yet, counterfactual data generation is costly since it relies on human annotation. Thus, we propose a novel solution that only requires annotation of a small fraction (e.g., 1%) of the original training data, and uses automatic generation of extra counterfactuals in an encoding vector space. We demonstrate the effectiveness of our approach in sentiment classification, using IMDb data for training and other sets for OOD tests (i.e., Amazon, SemEval and Yelp). We achieve noticeable accuracy improvements by adding only 1% manual counterfactuals: +3% compared to adding +100% in-distribution training samples, +1.3% compared to alternate counterfactual approaches.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [4.957622051239014, 9.10991096496582], "id": 4455, "title": "Data-Efficient Playlist Captioning With Musical and Linguistic Knowledge", "authors": "Giovanni Gabbolini, Romain Hennequin and Elena V. Epure", "abstract": "Music streaming services feature billions of playlists created by users, professional editors or algorithms. In this content overload scenario, it is crucial to characterise playlists, so that music can be effectively organised and accessed. Playlist titles and descriptions are proposed in natural language either manually by music editors and users or automatically from pre-defined templates. However, the former is time-consuming while the latter is limited by the vocabulary and covered music themes. \nIn this work, we propose PlayNTell, a data-efficient multi-modal encoder-decoder model for automatic playlist captioning. Compared to existing music captioning algorithms, PlayNTell leverages also linguistic and musical knowledge to generate correct and thematic captions. We benchmark PlayNTell on a new editorial playlists dataset collected from two major music streaming services.\nPlayNTell yields 2x-3x higher BLEU@4 and CIDEr than state of the art captioning algorithms.", "track": "NLP Applications", "label": 0}, {"loc": [10.811177253723145, 9.330028533935547], "id": 4457, "title": "Improved grammatical error correction by ranking elementary edits", "authors": "Alexey Sorokin", "abstract": "We offer a two-stage reranking method for grammatical error correction: the first model serves as edit generator, while the second classifies the proposed edits as correct or false. We show how to use both encoder-decoder and sequence labeling models for the first step of our pipeline. We achieve state-of-the-art quality on BEA 2019 English dataset even using weak BERT-GEC edit generator. Combining our roberta-base scorer with state-of-the-art GECToR edit generator, we surpass GECToR by 2-3\\%. With a larger model we establish a new SOTA on BEA development and test sets. Our model also sets a new SOTA on Russian, despite using smaller models and less data than the previous approaches.", "track": "NLP Applications", "label": 0}, {"loc": [8.523696899414062, 6.956820964813232], "id": 4459, "title": "Improving Tokenisation by Alternative Treatment of Spaces", "authors": "Edward Gow-Smith, Harish Tayyar Madabushi, Carolina Scarton and Aline Villavicencio", "abstract": "", "track": "Phonology, Morphology and Word Segmentation", "label": 25}, {"loc": [5.485447406768799, 8.540998458862305], "id": 4463, "title": "GENIE: Toward Reproducible and Standardized Human Evaluation for Text Generation", "authors": "Daniel Khashabi, Gabriel Stanovsky, Jonathan Bragg, Nicholas Lourie, Jungo Kasai, Yejin Choi, Noah A. Smith and Daniel Weld", "abstract": "While often assumed a gold standard, effective human evaluation of text generation remains an important, open area for research.\nWe revisit this problem with a focus on producing consistent evaluations that are reproducible---over time and across different populations. \nWe study this goal in different stages of the human evaluation pipeline. \nIn particular, we consider design choices for the annotation interface used to elicit human judgments and their impact on reproducibility. \nFurthermore, we develop an automated mechanism for maintaining annotator quality via a probabilistic model that detects and excludes noisy annotators. \nPutting these lessons together, we introduce GENIE: a system for running standardized human evaluations across different generation tasks.\nWe instantiate GENIE with datasets representing four core challenges in text generation: machine translation, summarization, commonsense reasoning, and machine comprehension.\nFor each task, GENIE offers a leaderboard that automatically crowdsources annotations for submissions, evaluating them along axes such as correctness, conciseness, and fluency.\nWe have made the GENIE leaderboards publicly available, and have already ranked 50 submissions from 10 different research groups. We hope GENIE encourages further progress toward effective, standardized evaluations for text generation.", "track": "Resources and Evaluation", "label": 1}, {"loc": [6.086878776550293, 5.376801490783691], "id": 4464, "title": "Attentional Probe: Estimating a Module's Functional Potential", "authors": "Tiago Pimentel, Josef Valvoda, Niklas Stoehr and Ryan Cotterell", "abstract": "", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [7.174093723297119, 6.870387554168701], "id": 4466, "title": "When More Data Hurts: A Troubling Quirk in Developing Broad-Coverage Natural Language Understanding Systems", "authors": "Elias Stengel-Eskin, Emmanouil Antonios Platanios, Adam Pauls, Sam Thomson, Hao Fang, Benjamin Van Durme, Jason Eisner and Yu Su", "abstract": "In natural language understanding (NLU) production systems, users' evolving needs necessitate the addition of new features over time, indexed by new symbols added to the meaning representation space. This requires additional training data and results in ever-growing datasets. We present the first systematic investigation into this incremental symbol learning scenario. Our analysis reveals a troubling quirk in building broad-coverage NLU systems: as the training dataset grows, performance on a small set of new symbols often decreases. We show that this trend holds for multiple mainstream models on two common NLU tasks: intent recognition and semantic parsing. Rejecting class imbalance as the sole culprit, we reveal that the trend is closely associated with an effect we call source signal dilution, where strong lexical cues for the new symbol become diluted as the training dataset grows. Selectively dropping training examples to prevent dilution often reverses the trend, showing the over-reliance of mainstream neural NLU models on simple lexical cues.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [9.129129409790039, 6.927538871765137], "id": 4469, "title": "Zero-shot Cross-lingual Transfer of Prompt-based Tuning with a Unified Multilingual Prompt", "authors": "Lianzhe Huang, Shuming Ma, Dongdong Zhang, Furu Wei and Houfeng Wang", "abstract": "Prompt-based tuning has been proven effective for pretrained language models (PLMs). While most of the existing work focuses on the monolingual prompts, we study the multilingual prompts for multilingual PLMs, especially in the zero-shot cross-lingual setting. To alleviate the effort of designing different prompts for multiple languages, we propose a novel model that uses a unified prompt for all languages, called UniPrompt. Different from the discrete prompts and soft prompts, the unified prompt is model-based and language-agnostic. Specifically, the unified prompt is initialized by a multilingual PLM to produce language-independent representation, after which is fused with the text input. During inference, the prompts can be pre-computed so that no extra computation cost is needed. To collocate with the unified prompt, we propose a new initialization method for the target label word to further improve the model's transferability across languages. Extensive experiments show that our proposed methods can significantly outperform the strong baselines across different languages. We release data and code to facilitate future research.", "track": "Multilinguality", "label": 13}, {"loc": [6.477760314941406, 7.508520126342773], "id": 4473, "title": "Three Real-World Datasets and Neural Computational Models for Classification Tasks in Patent Landscaping", "authors": "Subhash C. Pujari, Jannik Str\u00f6tgen, Mark Giereth, Michael Gertz and Annemarie Friedrich", "abstract": "Patent Landscaping, one of the central tasks of intellectual property management, includes selecting and grouping patents according to user-defined technical or application-oriented criteria. While recent transformer-based models have been shown to be effective for classifying patents into taxonomies such as CPC or IPC, there is yet little research on how to support real-world Patent Landscape Studies (PLSs) using natural language processing methods. With this paper, we release three labeled datasets for PLS-oriented classification tasks covering two diverse domains. We provide a qualitative analysis and report detailed corpus statistics.\n\nMost research on neural models for patents has been restricted to leveraging titles and abstracts. We compare strong neural and non-neural baselines, proposing a novel model that takes into account textual information from the patents' full texts as well as embeddings created based on the patents' CPC labels. We find that for PLS-oriented classification tasks, going beyond title and abstract is crucial, CPC labels are an effective source of information, and combining all features yields the best results.", "track": "Resources and Evaluation", "label": 1}, {"loc": [6.893301963806152, 9.926328659057617], "id": 4474, "title": "Topic Modeling With Topological Data Analysis", "authors": "Ciar\u00e1n Byrne, Danijela Horak, Karo Moilanen and Amandla Mabona", "abstract": "Recent unsupervised topic modelling ap-\nproaches that use clustering techniques on\nword, token or document embeddings can ex-\ntract coherent topics. A common limitation\nof such approaches is that they reveal noth-\ning about inter-topic relationships which are\nessential in many real-world application do-\nmains. We present an unsupervised topic mod-\nelling method which harnesses Topological\nData Analysis (TDA) to extract a topological\nskeleton of the manifold upon which contextu-\nalised word embeddings lie. We demonstrate\nthat our approach, which performs on par with\na recent baseline, is able to construct a network\nof coherent topics together with meaningful\nrelationships between them.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [8.490314483642578, 8.280784606933594], "id": 4484, "title": "Predicting Fine-Tuning Performance with Probing", "authors": "Zining Zhu, Soroosh Shahtalebi and Frank Rudzicz", "abstract": "Large NLP models have recently shown impressive performance in language understanding tasks, typically evaluated by their fine-tuned performance. Alternatively, probing has received increasing attention as being a lightweight method for interpreting the intrinsic mechanisms of large NLP models. In probing, post-hoc classifiers are trained on \"out-of-domain\" datasets that diagnose specific abilities. While probing the language models has led to insightful findings, they appear disjointed from the development of models. This paper explores the utility of probing deep NLP models to extract a proxy signal widely used in model development -- the fine-tuning performance. We find that it is possible to use the accuracies of only three probing tests to predict the fine-tuning performance with errors 40% - 80% smaller than baselines. We further discuss possible avenues where probing can empower the development of deep NLP models.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [1.3252894878387451, 4.922028064727783], "id": 4493, "title": "Diverse Parallel Data Synthesis for Cross-Database Adaptation of Text-to-SQL Parsers", "authors": "Abhijeet Awasthi, Ashutosh Sathe and Sunita Sarawagi", "abstract": "Text-to-SQL parsers often struggle with database schemas unseen during the train time. Adapting Text-to-SQL parsers to new schemas is a challenging problem owing to a huge diversity of schemas and zero availability of natural language queries in the new schema. We present ReFill, a framework for synthesizing high-quality and textually diverse parallel datasets for adapting Text-to-SQL parsers. Unlike the prior SQL-to-Text generation methods, ReFill learns to retrieve-and-edit text queries from existing schemas and transfer them to the new schema. ReFill uses a novel method of retrieving diverse existing text, masking their schema-specific tokens, and refilling with tokens relevant to the target schema. We show that this process leads to significantly more diverse text queries than achievable by a standard SQL-to-Text generation model. Through experiments on several databases, we show that adapting a parser by finetuning on datasets synthesized using ReFill consistently outperforms prior data-augmentation methods.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [2.225458860397339, 7.405486583709717], "id": 4494, "title": "Agent-Specific Deontic Modality Detection in Legal Language", "authors": "Abhilasha Sancheti, Aparna Garimella, Balaji Vasan Srinivasan and Rachel Rudinger", "abstract": "Legal documents are typically long and written in legalese, which makes it particularly difficult for laypeople to understand their rights and duties. While natural language understanding technologies can be valuable in supporting such understanding in the legal domain, the limited availability of datasets annotated for deontic modalities in the legal domain, due to the cost of hiring experts and privacy issues, is a bottleneck. To this end, we introduce, LEXDEMOD, a corpus of English contracts annotated\nwith deontic modality expressed with respect to a contracting party or agent along with the modal triggers. We benchmark this dataset on two tasks: (i) agent-specific multi-label deontic modality classification, and (ii) agent-specific deontic modality and trigger span detection using Transformer-based (Vaswani et al., 2017) language models. Transfer learning experiments show that the linguistic diversity of modal expressions in LEXDEMOD generalizes reasonably from lease to employment and\nrental agreements. A small case study indicates that a model trained on LEXDEMOD can detect red flags with high recall. We believe our work offers a new research direction for deontic modality detection in the legal domain.", "track": "Resources and Evaluation", "label": 1}, {"loc": [7.592691898345947, 12.30020809173584], "id": 4495, "title": "COLD: A Benchmark for Chinese Offensive Language Detection", "authors": "Jiawen Deng, Jingyan ZHOU, Hao Sun, Chujie Zheng, Fei Mi, Helen Meng and Minlie Huang", "abstract": "Offensive language detection is increasingly crucial for maintaining a civilized social media platform and deploying pre-trained language models. However, this task in Chinese is still under exploration due to the scarcity of reliable datasets. To this end, we propose a benchmark --COLD for Chinese offensive language analysis, including a Chinese Offensive Language Dataset --COLDATASET and a baseline detector --COLDETECTOR which is trained on the dataset. We show that the COLD benchmark contributes to Chinese offensive language detection which is challenging for existing resources. We then deploy the COLDETECTOR and conduct detailed analyses on popular Chinese pre-trained language models. We first analyze the offensiveness of existing generative models and show that these models inevitably expose varying degrees of offensive issues. Furthermore, we investigate the factors that influence the offensive generations, and we find that anti-bias contents and keywords referring to certain groups or revealing negative attitudes trigger offensive outputs easier.", "track": "Ethics", "label": 21}, {"loc": [6.366693496704102, 2.063981533050537], "id": 4505, "title": "Fixing Model Bugs with Natural Language Patches", "authors": "Shikhar Murty, Christopher D. Manning, Scott Lundberg and Marco Tulio Ribeiro", "abstract": "Current approaches for fixing systematic problems in NLP models (e.g., regex patches, finetuning on more data) are either brittle, or labor-intensive and liable to shortcuts. In contrast, humans often provide corrections to each other through natural language. Taking inspiration from this, we explore natural language patches---declarative statements that allow developers to provide corrective feedback at the right level of abstraction, either overriding the model (``if a review gives 2 stars, the sentiment is negative'') or providing additional information the model may lack (``if something is described as the bomb, then it is good''). We model the task of determining if a patch applies separately from the task of integrating patch information, and show that with a small amount of synthetic data, we can teach models to effectively use real patches on real data---1 to 7 patches improve accuracy by ~1--4 accuracy points on different slices of a sentiment analysis dataset, and F1 by 7 points on a relation extraction dataset. Finally, we show that finetuning on as many as 100 labeled examples may be needed to match the performance of a small set of language patches.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [7.921070575714111, 3.2946479320526123], "id": 4508, "title": "WeDef: Weakly Supervised Backdoor Defense for Text Classification", "authors": "Lesheng Jin, Zihan Wang and Jingbo Shang", "abstract": "Existing backdoor defense methods are only effective for limited trigger types. To defend different trigger types at once, we start from the class-irrelevant nature of the poisoning process and propose a novel weakly supervised backdoor defense framework WeDef. Recent advances in weak supervision make it possible to train a reasonably accurate text classifier using only a small number of user-provided, class-indicative seed words. Such seed words shall be considered independent of the triggers. Therefore, a weakly supervised text classifier trained by only the poisoned documents without their labels will likely have no backdoor. Inspired by this observation, in WeDef, we define the reliability of samples based on whether the predictions of the weak classifier agree with their labels in the poisoned training set. We further improve the results through a two-phase sanitization: (1) iteratively refine the weak classifier based on the reliable samples and (2) train a binary poison classifier by distinguishing the most unreliable samples from the most reliable samples. Finally, we train the sanitized model on the samples that the poison classifier predicts as benign. Extensive experiments show that WeDef is effective against popular trigger-based attacks (e.g., words, sentences, and paraphrases), outperforming existing defense methods.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [6.059531211853027, 5.279592514038086], "id": 4512, "title": "Interventional Training for Out-Of-Distribution Natural Language Understanding", "authors": "Sicheng Yu, Jing Jiang, Hao Zhang, Yulei Niu, Qianru Sun and Lidong Bing", "abstract": "Out-of-distribution (OOD) settings are used to measure a model's performance when the distribution of the test data is different from that of the training data. NLU models are known to suffer in OOD. We study this issue from the perspective of causality, which sees confounding bias as the reason for models to learn spurious correlations. While a common solution is to perform intervention, existing methods handle only known and single confounder, but in many NLU tasks the confounders can be both unknown and multifactorial. In this paper, we propose a novel interventional training method called Bottom-up Automatic Intervention (BAI) that performs multi-granular intervention with identified multifactorial confounders. Our experiments on three NLU tasks, namely, natural language inference, fact verification and paraphrase identification, show the effectiveness of BAI for tackling OOD settings.", "track": "Machine Learning for NLP", "label": 3}, {"loc": [1.740082859992981, 3.8306288719177246], "id": 4514, "title": "Pseudo-Relevance for Enhancing Document Representation", "authors": "Jihyuk Kim, Seung-won Hwang, Seoho Song, Hyeseon Ko and Young-In Song", "abstract": "This paper studies how to enhance the document representation for the bi-encoder approach in dense document retrieval. The bi-encoder, separately encoding a query and a document as a single vector, is favored for high efficiency in large-scale information retrieval, compared to more effective but complex architectures. To combine the strength of the two, the multi-vector representation of documents for bi-encoder, such as ColBERT preserving all token embeddings, has been widely adopted. Our contribution is to reduce the size of the multi-vector representation, without compromising the effectiveness, supervised by query logs. Our proposed solution decreases the latency and the memory footprint, up to 8- and 3-fold, validated on MSMARCO and real-world search query logs.", "track": "Information Retrieval and Text Mining", "label": 15}, {"loc": [7.7016825675964355, 9.640179634094238], "id": 4515, "title": "ZeroGen: Efficient Zero-shot Learning via Dataset Generation", "authors": "Jiacheng Ye, Jiahui Gao, Qintong Li, Hang XU, Jiangtao Feng, Zhiyong Wu, Tao Yu and Lingpeng Kong", "abstract": "There is a growing interest in dataset generation recently due to the superior generative capacity of large pre-trained language models (PLMs). In this paper, we study a flexible and efficient zero-short learning method, ZeroGen.\nGiven a zero-shot task, we first generate a dataset from scratch using PLMs in an unsupervised manner. Then, we train a tiny task model (e.g., LSTM) under the supervision of the synthesized dataset. This approach allows highly efficient inference as the final task model only has orders of magnitude fewer parameters comparing to PLMs (e.g., GPT2-XL).\nApart from being annotation-free and efficient, we argue that ZeroGen can also provide useful insights from the perspective of data-free model-agnostic knowledge distillation, and unreferenced text generation evaluation. \nExperiments and analysis on different NLP tasks, namely, text classification, question answering, and natural language inference, show the effectiveness of ZeroGen.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [6.472930908203125, 7.461972713470459], "id": 4525, "title": "Neighborhood Contrastive Learning for Scientific Document Representations with Citation Embeddings", "authors": "Malte Ostendorff, Nils Rethmeier, Isabelle Augenstein, Bela Gipp and Georg Rehm", "abstract": "Learning scientific document representations can be substantially improved through contrastive learning objectives, where the challenge lies in creating positive and negative training samples that encode the desired similarity semantics. Prior work relies on discrete citation relations to generate contrast samples. However, discrete citations enforce a hard cut-off to similarity. This is counter-intuitive to similarity-based learning and ignores that scientific papers can be very similar despite lacking a direct citation - a core problem of finding related research. Instead, we use controlled nearest neighbor sampling over citation graph embeddings for contrastive learning. This control allows us to learn continuous similarity, to sample hard-to-learn negatives and positives, and also to avoid collisions between negative and positive samples by controlling the sampling margin between them. The resulting method SciNCL outperforms the state-of-the-art on the SciDocs benchmark. Furthermore, we demonstrate that it can train (or tune) language models sample-efficiently and that it can be combined with recent training-efficient methods. Perhaps surprisingly, even training a general-domain language model this way outperforms baselines pretrained in-domain.", "track": "NLP Applications", "label": 0}, {"loc": [5.019529342651367, 3.7464747428894043], "id": 4528, "title": "SPE: Symmetrical Prompt Enhancement for Fact Probing", "authors": "Yiyuan Li, Tong Che, Yezhen Wang, Zhengbao Jiang, Caiming Xiong and Snigdha Chaturvedi", "abstract": "Pretrained language models (PLMs) have been shown to accumulate factual knowledge during pretraining (Petroni et al. 2019). Recent works probe PLMs for the extent of this knowledge through prompts either in discrete or continuous forms. However, these methods do not consider symmetry of the task: object prediction and subject prediction. In this work, we propose Symmetrical Prompt Enhancement (SPE), a continuous prompt-based method for factual probing in PLMs that leverages the symmetry of the task by constructing symmetrical prompts for subject and object prediction. Our results on a popular factual probing dataset, LAMA, show significant improvement of SPE over previous probing methods.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [8.537823677062988, 8.273076057434082], "id": 4535, "title": "Efficient Large Scale Language Modeling with Mixtures of Experts", "authors": "Mikel Artetxe, Shruti Bhosale, Naman Goyal, Todor Mihaylov, Myle Ott, Sam Shleifer, Xi Victoria Lin, Jingfei Du, Srinivasan Iyer, Ramakanth Pasunuru, Giridharan Anantharaman, Xian Li, Shuohui Chen, Halil Akin, Mandeep Baines, Louis Rapha\u00ebl Th\u00e9o Martin, Xing Zhou, Punit Singh Koura, Brian O'Horo, Jeffrey Wang, Luke Zettlemoyer, Mona Diab, Zornitsa Kozareva and Veselin Stoyanov", "abstract": "Mixture of Experts layers (MoEs) enable efficient scaling of language models through conditional computation. This paper presents a detailed empirical study of how autoregressive MoE language models scale in comparison with dense models in a wide range of settings: in- and out-of-domain language modeling, zero- and few-shot priming, and full-shot fine-tuning. With the exception of fine-tuning, we find MoEs to be substantially more compute efficient. At more modest training budgets, MoEs can match the performance of dense models using ~4 times less compute. This gap narrows at scale, but our largest MoE model (1.1T parameters) consistently outperforms a compute-equivalent dense model (6.7B parameters). Overall, this performance gap varies greatly across tasks and domains, suggesting that MoE and dense models generalize differently in ways that are worthy of future study. We make our code and models publicly available for research use.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [2.5252745151519775, 8.720478057861328], "id": 4546, "title": "MedJEx: A Medical Jargon Extraction Model with Wiki's Hyperlink Span and Contextualized Masked Language Model Score", "authors": "Sunjae Kwon, Zonghai Yao, Harmon S. Jordan, David A. Levy, Brian Corner and hong yu", "abstract": "This paper proposes a new natural language processing (NLP) application for identifying medical jargon terms potentially difficult for patients to comprehend from electronic health record (EHR) notes. We first present a novel and publicly available dataset with expert-annotated medical jargon terms from 18K+ EHR note sentences (MedJ). Then, we introduce a novel medical jargon extraction (MedJEx) model which has been shown to outperform existing state-of-the-art NLP models. First, MedJEx improved the overall performance when it was trained on an auxiliary Wikipedia hyperlink span dataset, where hyperlink spans provide additional Wikipedia articles to explain the spans (or terms), and then fine-tuned on the annotated MedJ data. Secondly, we found that a contextualized masked language model score was beneficial for detecting domain-specific unfamiliar jargon terms. Moreover, our results show that training on the auxiliary Wikipedia hyperlink span datasets improved six out of eight biomedical named entity recognition benchmark datasets. MedJEx is publicly available.", "track": "NLP Applications", "label": 0}, {"loc": [3.6228268146514893, 5.14925479888916], "id": 4548, "title": "Discourse Comprehension: A Question Answering Framework to Represent Sentence Connections", "authors": "Wei-Jen Ko, Cutter J. Dalton, Mark Joseph Simmons, Eliza Fisher, Greg Durrett and Junyi Jessy Li", "abstract": "While there has been substantial progress in text comprehension through simple factoid question answering, more holistic comprehension of a discourse still presents a major challenge (Dunietz et al., 2020). Someone critically reflecting on a text as they read it will pose curiosity-driven, often open-ended questions, which reflect deep understanding of the content and require complex reasoning to answer (Ko et al., 2020; Westera et al., 2020). A key challenge in building and evaluating models for this type of discourse comprehension is the lack of annotated data, especially since collecting answers to such questions requires high cognitive load for annotators.\n\nThis paper presents a novel paradigm that enables scalable data collection targeting the comprehension of news documents, viewing these questions through the lens of discourse. The resulting corpus, DCQA (Discourse Comprehension by Question Answering), captures both discourse and semantic links between sentences in the form of free-form, open-ended questions. On an evaluation set that we annotated on questions from Ko et al. (2020), we show that DCQA provides valuable supervision for answering open-ended questions. We additionally design pre-training methods utilizing existing question-answering resources, and use synthetic data to accommodate unanswerable questions.", "track": "Discourse and Pragmatics", "label": 24}, {"loc": [4.112943649291992, 9.315643310546875], "id": 4551, "title": "Learning to Generate Overlap Summaries through Noisy Synthetic Data", "authors": "Naman Bansal, Mousumi Akter and Shubhra Kanti Karmaker", "abstract": "Semantic Overlap Summarization (SOS) is a novel and relatively under-explored seq-to-seq task which entails summarizing common information from multiple alternate narratives. One of the major challenges for solving this task is the lack of existing datasets for supervised training. To address this challenge, we propose a novel data augmentation technique, which allows us to create large amount of synthetic data for training a seq-to-seq model that can perform the SOS task. Through extensive experiments using narratives from the news domain, we show that the models fine-tuned using the synthetic dataset provide significant performance improvements over the pre-trained vanilla summarization techniques and are close to the models fine-tuned on the golden training data; which essentially demonstrates the effectiveness of out proposed data augmentation technique for training seq-to-seq models on the SOS task.", "track": "Summarization", "label": 14}, {"loc": [7.34815788269043, 7.068229675292969], "id": 4554, "title": "Mutual Exclusivity Training and Primitive Augmentation to Induce Compositionality", "authors": "Yichen Jiang, Xiang Zhou and Mohit Bansal", "abstract": "Recent datasets expose the lack of the systematic generalization ability in standard sequence-to-sequence models. In this work, we analyze this behavior of seq2seq models and identify two contributing factors: a lack of mutual exclusivity bias (one target sequence can only be mapped to one source sequence), and the tendency to memorize whole examples rather than separating structures from contents. We propose two techniques to address these two issues respectively: Mutual Exclusivity Training that prevents the model from producing seen generations when facing novel examples via an unlikelihood-based loss, and prim2primX data augmentation that automatically diversifies the arguments of every syntactic function to prevent memorizing and provide a compositional inductive bias without exposing test-set data. Combining these two techniques, we show substantial empirical improvements using standard sequence-to-sequence models (LSTMs and Transformers) on two widely-used compositionality datasets: SCAN and COGS. Finally, we provide analysis characterizing the improvements as well as the remaining challenges, and provide detailed ablations of our method.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [7.591396808624268, 12.346772193908691], "id": 4560, "title": "Directions for NLP Practices Applied to Online Hate Speech Detection", "authors": "Paula Fortuna, Monica Dominguez, Leo Wanner and Zeerak Talat", "abstract": "Addressing hate speech in online spaces has been conceptualized as a classification task that uses Natural Language Processing (NLP) techniques. Through this conceptualization, the hate speech detection task has relied on common conventions and practices from NLP. For instance, inter-annotator agreement is conceptualized as a way to measure dataset quality and certain metrics and benchmarks are used to assure model generalization. However, hate speech is a deeply complex and situated concept that eludes such static and disembodied practices. In this position paper, we critically reflect on these methodologies for hate speech detection, we argue that many conventions in NLP are poorly suited for the problem and encourage researchers to develop methods that are more appropriate for the task.", "track": "Theme Track", "label": 18}, {"loc": [2.9587690830230713, 4.6016526222229], "id": 4562, "title": "Pre-training Transformer Models with Sentence-Level Objectives for Answer Sentence Selection", "authors": "Luca Di Liello, Siddhant Garg, Luca Soldaini and Alessandro Moschitti", "abstract": "An important task for designing QA systems is answer sentence selection (AS2): selecting the sentence containing (or constituting) the answer to a question from a set of retrieved relevant documents. In this paper, we propose three novel sentence-level transformer pre-training objectives that incorporate paragraph-level semantics within and across documents, to improve the performance of transformers for AS2, and mitigate the requirement of large labeled datasets. Specifically, the model is tasked to predict whether: (i) two sentences are extracted from the same paragraph, (ii) a given sentence is extracted from a given paragraph, and (iii) two paragraphs are extracted from the same document. Our experiments on three public and one industrial AS2 datasets demonstrate the empirical superiority of our pre-trained transformers over baseline models such as RoBERTa and ELECTRA for AS2.", "track": "Question Answering", "label": 11}, {"loc": [3.725797653198242, 4.802581310272217], "id": 4564, "title": "OpenCQA: Open-ended Question Answering with Charts", "authors": "Shankar Kantharaj, Xuan Long Do, Rixie Tiffany Leong, Jia Qing Tan, Enamul Hoque and Shafiq Joty", "abstract": "Charts are very popular to analyze data and convey important insights. People often analyze visualizations to answer open-ended questions that require explanatory answers. Answering such questions are often difficult and time-consuming as it requires a lot of cognitive and perceptual efforts. To address this challenge, we introduce a new task called OpenCQA, where the goal is to answer an open-ended question about a chart with descriptive texts. We present the annotation process and an in-depth analysis of our dataset. We implement and evaluate a set of baselines under three practical settings. In the first setting, a chart and the accompanying article is provided as input to the model. The second setting provides only the relevant paragraph(s) to the chart instead of the entire article, whereas the third setting requires the model to generate an answer solely based on the chart. Our analysis of the results show that the top performing models generally produce fluent and coherent text while they struggle to perform complex logical and arithmetic reasoning.", "track": "Question Answering", "label": 11}, {"loc": [4.856136322021484, 3.4252820014953613], "id": 4566, "title": "A Systematic Investigation of Commonsense Knowledge in Large Language Models", "authors": "Xiang Lorraine Li, Adhiguna Kuncoro, Jordan Hoffmann, Cyprien de Masson d'Autume, Phil Blunsom and Aida Nematzadeh", "abstract": "Language models (LMs) trained on large amounts of data have shown impressive performance on many NLP tasks under the zero-shot and few-shot setup. Here we aim to better understand the extent to which such models learn commonsense knowledge --- a critical component of many NLP applications. We conduct a systematic and rigorous zero-shot and few-shot commonsense evaluation of large pre-trained LMs, where we: (i) carefully control for the LMs' ability to exploit potential surface cues and annotation artefacts, and (ii) account for variations in performance that arise from factors that are not related to commonsense knowledge. Our findings highlight the limitations of pre-trained LMs in acquiring commonsense knowledge without task-specific supervision; furthermore, using larger models or few-shot evaluation is insufficient to achieve human-level commonsense performance.", "track": "Commonsense Reasoning", "label": 19}, {"loc": [8.981797218322754, 7.146712779998779], "id": 4573, "title": "Transforming Sequence Tagging Into A Seq2Seq Task", "authors": "Karthik Raman, Iftekhar Naim, Jiecao Chen, Kazuma Hashimoto, Kiran Yalasangi and Krishna Srinivasan", "abstract": "Pretrained, large, generative language models (LMs) have had great success in a wide range of sequence tagging and structured prediction tasks. Casting a sequence tagging task as a Seq2Seq one requires deciding the formats of the input and output sequences. However, we lack a principled understanding of the trade-offs associated with these formats (such as the effect on model accuracy, sequence length, multilingual generalization, hallucination). In this paper, we rigorously study different formats one could use for casting input text sentences and their output labels into the input and target (i.e., output) of a Seq2Seq model. Along the way, we introduce a new format, which we show to to be both simpler and more effective. Additionally the new format demonstrates significant gains in the multilingual settings -- both zero-shot transfer learning and joint training. Lastly, we find that the new format is more robust and almost completely devoid of hallucination -- an issue we find common in existing formats. With well over a 1000 experiments studying 14 different formats, over 7 diverse public benchmarks -- including 3 multilingual datasets spanning 7 languages -- we believe our findings provide a strong empirical basis in understanding how we should tackle sequence tagging tasks.", "track": "Multilinguality", "label": 13}, {"loc": [2.2859129905700684, 4.920640468597412], "id": 4576, "title": "CycleKQR: Unsupervised Bidirectional Keyword-Question Rewriting", "authors": "Andrea Iovine, Anjie Fang, Besnik Fetahu, Jie Zhao, Oleg Rokhlenko and Shervin Malmasi", "abstract": "Users expect their queries to be answered by search systems, regardless of the query's surface form, which include keyword queries and natural questions. Natural Language Understanding (NLU) components of Search and QA systems may fail to correctly interpret semantically equivalent inputs if this deviates from how the system was trained, leading to suboptimal understanding capabilities. We propose the keyword-question rewriting task to improve query understanding capabilities of NLU systems for all surface forms. To achieve this, we present CycleKQR, an unsupervised approach, enabling effective rewriting between keyword and question queries using non-parallel data.\n\nEmpirically we show the impact on QA performance of unfamiliar query forms for open domain and Knowledge Base QA systems (trained on either keywords or natural language questions). We demonstrate how CycleKQR significantly improves QA performance by rewriting queries into the appropriate form, while at the same time retaining the original semantic meaning of input queries, allowing CycleKQR to improve performance by up to 3% over supervised baselines. Finally, we release a dataset\nof 66k keyword-question pairs.", "track": "Unsupervised and Weakly-Supervised Methods in NLP", "label": 17}, {"loc": [4.82905912399292, 9.073969841003418], "id": 4583, "title": "Model Criticism for Long-Form Text Generation", "authors": "Yuntian Deng, Volodymyr Kuleshov and Alexander Rush", "abstract": "Language models have demonstrated the ability to generate highly fluent text; however, it remains unclear whether their output retains coherent high-level structure (e.g., story progression). Here, we propose to apply a statistical tool, model criticism in latent space, to evaluate the high-level structure of the generated text. Model criticism compares the distributions between real and generated data in a latent space obtained according to an assumptive generative process. Different generative processes identify specific failure modes of the underlying model. We perform experiments on three representative aspects of high-level discourse---coherence, coreference, and topicality---and find that transformer-based language models are able to capture topical structures but have a harder time maintaining structural coherence or modeling coreference.", "track": "Language Modeling and Analysis of Language Models", "label": 2}, {"loc": [3.926274299621582, 9.760669708251953], "id": 4586, "title": "Improving Faithfulness by Augmenting Negative Summaries from Fake Documents", "authors": "Tianshu Wang, Faisal Ladhak, Esin Durmus and He He", "abstract": "Current abstractive summarization systems tend to hallucinate content that is unfaithful to the source document, posing a risk of misinformation. To mitigate hallucination, we must teach the model to distinguish hallucinated summaries from faithful ones. However, the commonly used maximum likelihood training does not disentangle factual errors from other model errors. To address this issue,\nwe propose a back-translation-style approach to augment negative samples that mimic factual errors made by the model. Specifically, we train an elaboration model that generates hallucinated documents given the reference summaries, and then generates negative summaries from the fake documents. We incorporate the negative samples into training through a controlled generator, which produces faithful/unfaithful summaries conditioned on the control codes. Additionally, we find that adding textual entailment data through multitasking further boosts the performance. Experiments on three datasets (XSum, Gigaword, and WikiHow) show that our method consistently improves faithfulness without sacrificing informativeness according to both human and automatic evaluation", "track": "Summarization", "label": 14}, {"loc": [0.5536993145942688, 7.035636901855469], "id": 4589, "title": "Joint Completion and Alignment of Multilingual Knowledge Graphs", "authors": "Soumen Chakrabarti, Harkanwar Singh, Shubham Lohiya, Prachi Jain and Mausam", "abstract": "Knowledge Graph Completion (KGC) predicts missing facts in an incomplete Knowledge Graph (KG). Multilingual KGs associate entities and relations with surface forms written in different languages. An entity or relation may be associated with distinct IDs in different KGs, necessitating entity alignment (EA) and relation alignment (RA). Many effective algorithms have been proposed for completion and alignment as separate tasks. Here we show that these tasks are synergistic and best solved together. Our multitask approach starts with a state-of-the-art KG embedding scheme, but adds a novel relation representation based on sets of embeddings of (subject, object) entity pairs. This representation leads to a new relation alignment loss term based on a maximal bipartite matching between two sets of embedding vectors. This loss is combined with traditional KGC loss and optionally, losses based on text embeddings of entity (and relation) names. In experiments over KGs in seven languages, we find that our system achieves large improvements in KGC compared to a strong completion model that combines known facts in all languages. It also outperforms strong EA and RA baselines, underscoring the value of joint alignment and completion.", "track": "Multilinguality", "label": 13}, {"loc": [4.616734504699707, 6.277368545532227], "id": 4592, "title": "Offer a Different Perspective: Modeling the Belief Alignment of Arguments in Multi-party Debates", "authors": "Suzanna Sia, Kokil Jaidka, Hansin Ahuja, Niyati Chhaya and Kevin Duh", "abstract": "In contexts where debate and deliberation are the norm, the participants are regularly presented with new information that conflicts with their original beliefs. When required to update their beliefs (belief alignment), they may choose arguments that align with their worldview (confirmation bias). We test this and competing hypotheses in a constraint-based modeling approach to predict the winning arguments in multi-party interactions in the Reddit Change My View and Intelligence Squared debates datasets. We adopt a hierarchical generative Variational Autoencoder as our model and impose structural constraints that reflect competing hypotheses about the nature of argumentation. Our findings suggest that in most settings, predictive models that anticipate winning arguments to be further from the initial argument of the opinion holder are more likely to succeed.", "track": "Computational Social Science and Cultural Analytics", "label": 20}, {"loc": [7.500738143920898, 12.303512573242188], "id": 4594, "title": "A Federated Approach to Predicting Emojis in Hindi Tweets", "authors": "Deep Rajesh Gandhi, Jash Jayesh Mehta, Nirali Parekh, Karan Waghela, Lynette D'Mello and Zeerak Talat", "abstract": "The use of emojis affords a visual modality to, often private, textual communication.\nThe task of predicting emojis however provides a challenge for machine learning as emoji use tends to cluster into the frequently used and the rarely used emojis.\nMuch of the machine learning research on emoji use has focused on high resource languages and has conceptualised the task of predicting emojis around traditional server-side machine learning approaches.\nHowever, traditional machine learning approaches for private communication can introduce privacy concerns, as these approaches require all data to be transmitted to a central storage.\nIn this paper, we seek to address the dual concerns of emphasising high resource languages for emoji prediction and risking the privacy of people's data.\nWe introduce a new dataset of 118k tweets (augmented from 25k unique tweets) for emoji prediction in Hindi, and propose a modification to the federated learning algorithm, CausalFedGSD, which aims to strike a balance between model performance and user privacy. We show that our approach obtains comparative scores with more complex centralised models while reducing the amount of data required to optimise the models and minimising risks to user privacy.", "track": "NLP Applications", "label": 0}, {"loc": [4.06273078918457, 7.032609939575195], "id": 4595, "title": "Injecting Domain Knowledge in Language Models for Task-oriented Dialogue Systems", "authors": "Denis Emelin, Daniele Bonadiman, Sawsan Alqahtani, Yi Zhang and Saab Mansour", "abstract": "Pre-trained language models (PLM) have advanced the state-of-the-art across NLP applications, but lack domain-specific knowledge that does not naturally occur in pre-training data. Previous studies augmented PLMs with symbolic knowledge for different downstream NLP tasks. However, knowledge bases (KBs) utilized in these studies are usually large-scale and static, in contrast to small, domain-specific, and modifiable knowledge bases that are prominent in real-world task-oriented dialogue (TOD) systems. In this paper, we showcase the advantages of injecting domain-specific knowledge prior to fine-tuning on TOD tasks. To this end, we utilize light-weight adapters that can be easily integrated with PLMs and serve as a repository for facts learned from different KBs. To measure the efficacy of proposed knowledge injection methods, we introduce Knowledge Probing using Response Selection (KPRS) -- a probe designed specifically for TOD models. Experiments on KPRS and the response generation task show improvements of knowledge injection with adapters over strong baselines.", "track": "Dialogue and Interactive Systems", "label": 4}, {"loc": [3.3480262756347656, 4.083223819732666], "id": 4603, "title": "TASA: Deceiving Question Answering Models by Twin Answer Sentences Attack", "authors": "Yu Cao, Dianqi Li, Meng Fang, Tianyi Zhou, Jun Gao, Yibing Zhan and Dacheng Tao", "abstract": "We present Twin Answer Sentences Attack (TASA), an adversarial attack method for question answering (QA) models that produces fluent and grammatical adversarial contexts while maintaining gold answers. Despite phenomenal progress on general adversarial attacks, few works have investigated the vulnerability and attack specifically for QA models. In this work, we first explore the biases in the existing models and discover that they mainly rely on keyword matching between the question and context, and ignore the relevant contextual relations for answer prediction.\nBased on two biases above, TASA attacks the target model in two folds: (1) lowering the model's confidence on the gold answer with a perturbed answer sentence; \n(2) misguiding the model towards a wrong answer with a distracting answer sentence. Equipped with designed beam search and filtering methods, TASA can generate more effective attacks than existing textual attack methods while sustaining the quality of contexts, in extensive experiments on five QA datasets and human evaluations.", "track": "Interpretability, Interactivity and Analysis of Models for NLP", "label": 9}, {"loc": [9.21965217590332, 6.348487377166748], "id": 4604, "title": "Improving Low-Resource Languages in Pre-Trained Multilingual Language Models", "authors": "Viktor Hangya, Hossain Shaikh Saadi and Alexander Fraser", "abstract": "Pre-trained multilingual language models are the foundation of many NLP approaches, including cross-lingual transfer solutions. However, languages with small available monolingual corpora are often not well-supported by these models leading to poor performance. We propose an unsupervised approach to improve the cross-lingual representations of low-resource languages by bootstrapping word translation pairs from monolingual corpora and using them to improve language alignment in pre-trained language models. We perform experiments on nine languages, using contextual word retrieval and zero-shot named entity recognition to measure both intrinsic cross-lingual word representation quality and downstream task performance, showing improvements on both tasks. Our results show that it is possible to improve pre-trained multilingual language models by relying only on non-parallel resources.", "track": "Multilinguality", "label": 13}, {"loc": [3.967482566833496, 9.883973121643066], "id": 4607, "title": "SCROLLS: Standardized CompaRison Over Long Language Sequences", "authors": "Uri Shaham, Elad Segal, Maor Ivgi, Avia Efrat, Ori Yoran, Adi Haviv, Ankit Gupta, Wenhan Xiong, Mor Geva, Jonathan Berant and Omer Levy", "abstract": "NLP benchmarks have largely focused on short texts, such as sentences and paragraphs, even though long texts comprise a considerable amount of natural language in the wild. We introduce SCROLLS, a suite of tasks that require reasoning over long texts. We examine existing long-text datasets, and handpick ones where the text is naturally long, while prioritizing tasks that involve synthesizing information across the input. SCROLLS contains summarization, question answering, and natural language inference tasks, covering multiple domains, including literature, science, business, and entertainment. Initial baselines, including Longformer Encoder-Decoder, indicate that there is ample room for improvement on SCROLLS. We make all datasets available in a unified text-to-text format and host a live leaderboard to facilitate research on model architecture and pretraining methods.", "track": "Resources and Evaluation", "label": 1}, {"loc": [4.681589603424072, 6.0058979988098145], "id": 4608, "title": "PAR: Political Actor Representation Learning with Social Context and Expert Knowledge", "authors": "Shangbin Feng, Zhaoxuan Tan, Zilong Chen, Ningnan Wang, Peisheng Yu, Qinghua Zheng, Xiaojun Chang and Minnan Luo", "abstract": "Modeling the ideological perspectives of political actors is an essential task in computational political science with applications in many downstream tasks. Existing approaches are generally limited to textual data and voting records, while they neglect the rich social context and valuable expert knowledge for holistic ideological analysis. In this paper, we propose PAR, a Political Actor Representation learning framework that jointly leverages social context and expert knowledge. Specifically, we retrieve and extract factual statements about legislators to leverage social context information. We then construct a heterogeneous information network to incorporate social context and use relational graph neural networks to learn legislator representations. Finally, we train PAR with three objectives to align representation learning with expert knowledge, model ideological stance consistency, and simulate the echo chamber phenomenon. Extensive experiments demonstrate that PAR is better at augmenting political text understanding and successfully advances the state-of-the-art in political perspective detection and roll call vote prediction. Further analysis proves that PAR learns representations that reflect the political reality and provide new insights into political behavior.", "track": "NLP Applications", "label": 0}, {"loc": [5.776094913482666, 12.05517578125], "id": 4613, "title": "JDDC 2.1: A Multimodal Chinese Dialogue Dataset with Joint Tasks of Query Rewriting, Response Generation, Discourse Parsing, and Summarization", "authors": "Nan Zhao, Haoran Li, Youzheng Wu and Xiaodong He", "abstract": "The popularity of multimodal dialogue has stimulated the need for a new generation of dialogue agents with multimodal interactivity.\nWhen users communicate with customer service, they may express their requirements by means of text, images, or even videos. \nVisual information usually acts as discriminators for product models, or indicators of product failures, which play an important role in the E-commerce scenario.\nOn the other hand, detailed information provided by the images is limited, and typically, customer service systems cannot understand the intent of users without the input text.\nThus, bridging the gap between the image and text is crucial for communicating with customers.\nIn this paper, we construct JDDC 2.1, a large-scale multimodal multi-turn dialogue dataset collected from a mainstream Chinese E-commerce platform, containing about 246K dialogue sessions, 3M utterances, and 507K images, along with product knowledge bases and image category annotations. \nOver our dataset, we jointly define four tasks: \nthe multimodal dialogue response generation task,\nthe multimodal query rewriting task, the multimodal dialogue discourse parsing task, and the multimodal dialogue summarization task.\nJDDC 2.1 is the first corpus with annotations for all the above tasks over the same dialogue sessions, which facilitates the comprehensive research around the dialogue.\nIn addition, we present several text-only and multimodal baselines and show the importance of visual information for these tasks. Our dataset and implements will be publicly available.", "track": "Resources and Evaluation", "label": 1}, {"loc": [8.05459213256836, 5.738705158233643], "id": 4618, "title": "PCL: Peer-Contrastive Learning with Diverse Augmentations for Unsupervised Sentence Embeddings", "authors": "Qiyu Wu, Chongyang Tao, Tao Shen, Can Xu, Xiubo Geng and Daxin Jiang", "abstract": "Learning sentence embeddings in an unsupervised manner is fundamental in natural language processing. Recent common practice is to couple pre-trained language models with unsupervised contrastive learning, whose success relies on augmenting a sentence with a semantically-close positive instance to construct contrastive pairs. Nonetheless, existing approaches usually depend on a mono-augmenting strategy, which causes learning shortcuts towards the augmenting biases and thus corrupts the quality of sentence embeddings. A straightforward solution is resorting to more diverse positives from a multi-augmenting strategy, while an open question remains about how to unsupervisedly learn from the diverse positives but with uneven augmenting qualities in the text field. As one answer, we propose a novel Peer-Contrastive Learning (PCL) with diverse augmentations. PCL constructs diverse contrastive positives and negatives at the group level for unsupervised sentence embeddings. PCL performs peer-positive contrast as well as peer-network cooperation, which offers an inherent anti-bias ability and an effective way to learn from diverse augmentations. Experiments on STS benchmarks verify the effectiveness of PCL against its competitors in unsupervised sentence embeddings.", "track": "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "label": 8}, {"loc": [10.770026206970215, 6.930694103240967], "id": 4621, "title": "Digging Errors in NMT: Evaluating and Understanding Model Errors from Partial Hypothesis Space", "authors": "Jianhao Yan, Chenming Wu, Fandong Meng and Jie Zhou", "abstract": "Solid evaluation of neural machine translation (NMT) is key to its understanding and improvement. Current evaluation of an NMT system is usually built upon a heuristic decoding algorithm (e.g., beam search) and an evaluation metric assessing similarity between the translation and golden reference. However, this system-level evaluation framework is limited by evaluating only one best hypothesis and search errors brought by heuristic decoding algorithms. To better understand NMT models, we propose a novel evaluation protocol, which defines model errors with model's ranking capability over hypothesis space. To tackle the problem of exponentially large space, we propose two approximation methods, top region evaluation along with an exact top-k decoding algorithm, which finds top-ranked hypotheses in the whole hypothesis space, and Monte Carlo sampling evaluation, which simulates hypothesis space from a broader perspective. To quantify errors, we define our NMT model errors by measuring distance between the hypothesis array ranked by the model and the ideally ranked hypothesis array. After confirming the strong correlation with human judgment, we apply our evaluation to various NMT benchmarks and model architectures. We show that the state-of-the-art Transformer models face serious ranking issues and only perform at the random chance level in the top region. We further analyze model errors on architectures with different depths and widths, as well as different data-augmentation techniques, showing how these factors affect model errors. Finally, we connect model errors with the search algorithms and provide interesting findings of beam search inductive bias and correlation with Minimum Bayes Risk (MBR) decoding.", "track": "Machine Translation", "label": 10}, {"loc": [4.0284504890441895, 7.303844928741455], "id": 4622, "title": "DialogConv: A Lightweight Fully Convolutional Network for Multi-view Response Selection", "authors": "Yongkang Liu, Shi Feng, Wei Gao, Daling Wang and Yifei Zhang", "abstract": "Current end-to-end retrieval-based dialogue systems are mainly based on Recurrent Neural Networks or Transformers with attention \nmechanisms. Although promising results have been achieved, these models often suffer from slow inference or huge number of parameters. In this paper, we propose a novel lightweight fully convolutional architecture, called DialogConv, for response selection. DialogConv is exclusively built on top of convolution to extract matching features of context and response. Dialogues are modeled in 3D views, where DialogConv performs convolution operations on embedding view, word view and utterance view to capture richer semantic information from multiple contextual views. On the four benchmark datasets, compared with state-of-the-art baselines, DialogConv is on average about 8.5x smaller in size, and 79.39x and 10.64x faster on CPU and GPU devices, respectively. At the same time, DialogConv achieves the competitive effectiveness of response selection.", "track": "Dialogue and Interactive Systems", "label": 4}]; var labels = ["NLP Applications", "Resources and Evaluation", "Language Modeling and Analysis of Language Models", "Machine Learning for NLP", "Dialogue and Interactive Systems", "Information Extraction", "Natural Language Generation", "Speech, Vision, Robotics, Multimodal Grounding", "Semantics: Lexical, Sentence level, Textual Inference and Other areas", "Interpretability, Interactivity and Analysis of Models for NLP", "Machine Translation", "Question Answering", "Efficient Methods for NLP", "Multilinguality", "Summarization", "Information Retrieval and Text Mining", "Sentiment Analysis, Stylistic Analysis, and Argument Mining", "Unsupervised and Weakly-Supervised Methods in NLP", "Theme Track", "Commonsense Reasoning", "Computational Social Science and Cultural Analytics", "Ethics", "Linguistic Theories, Cognitive Modeling and Psycholinguistics", "Syntax, Parsing and their Applications", "Discourse and Pragmatics", "Phonology, Morphology and Word Segmentation", "Ethic Concerns:Ethics", "Ethic Concerns:Resources and Evaluation", "Ethic Concerns:Speech, Vision, Robotics, Multimodal Grounding", "Ethic Concerns:Unsupervised and Weakly-Supervised Methods in NLP", "Ethic Concerns:Language Modeling and Analysis of Language Models", "Ethic Concerns:Dialogue and Interactive Systems", "Ethic Concerns:Multilinguality", "Ethic Concerns:Linguistic Theories, Cognitive Modeling and Psycholinguistics", "Ethic Concerns:NLP Applications", "Ethic Concerns:Efficient Methods for NLP", "Ethic Concerns:Theme Track", "Ethic Concerns:Summarization", "Ethic Concerns:Discourse and Pragmatics", "Ethic Concerns:Computational Social Science and Cultural Analytics", "Ethic Concerns:Sentiment Analysis, Stylistic Analysis, and Argument Mining"];