diff --git "a/data/acl2023_spectre2-base.json" "b/data/acl2023_spectre2-base.json" new file mode 100644--- /dev/null +++ "b/data/acl2023_spectre2-base.json" @@ -0,0 +1,26510 @@ +[ + { + "idx": 1, + "title": "Program Chairs\u2019 Report on Peer Review at ACL 2023", + "abstract": "We present a summary of the efforts to improve conference peer review that were implemented at ACL\u201923. This includes work with the goal of improving review quality, clearer workflow and decision support for the area chairs, as well as our efforts to improve paper-reviewer matching for various kinds of non- mainstream NLP work, and improve the overall incentives for all participants of the peer review process. We present analysis of the factors affecting peer review, identify the most problematic issues that the authors complained about, and provide suggestions for the future chairs. We hope that publishing such reports would (a) improve transparency in decision-making, (b) help the people new to the field to understand how the *ACL conferences work, (c) provide useful data for the future chairs and workshop organizers, and also academic work on peer review, and (d) provide useful context for the final program, as a source of information for meta-research on the structure and trajectory of the field of NLP.", + "authors": [ + "Anna Rogers", + "Marzena Karpinska", + "Jordan Boyd-Graber", + "Naoaki Okazaki" + ], + "year": 2023, + "point2d": [ + 19.2919921875, + 17.371660232543945 + ], + "cluster": 40.0 + }, + { + "idx": 2, + "title": "One Cannot Stand for Everyone! Leveraging Multiple User Simulators to train Task-oriented Dialogue Systems", + "abstract": "User simulators are agents designed to imitate human users; recent advances have found that Task-oriented Dialogue (ToD) systems optimized toward a user simulator could better satisfy the need of human users. However, this might result in a sub-optimal ToD system if it is tailored to only one ad hoc user simulator, since human users can behave differently. In this paper, we propose a framework called MUST to optimize ToD systems via leveraging Multiple User SimulaTors. The main challenges of implementing MUST fall in 1) how to adaptively determine which user simulator to interact with the ToD system at each optimization step, since the ToD system might be over-fitted to some specific user simulators, and simultaneously under-fitted to some others; 2) how to avoid catastrophic forgetting of the adaption for a simulator that is not selected for several consecutive optimization steps.To tackle these challenges, we formulate MUST as a Multi-armed bandits (MAB) problem and provide a method called MUST_{\\mathrm{adaptive}} that balances i) the boosting adaption for adaptive interactions between different user simulators and the ToD system andii) the uniform adaption to avoid the catastrophic forgetting issue.With both automatic evaluations and human evaluations, our experimental results on MultiWOZ show that the dialogue system trained by MUST achieves a better performance than those trained by a single user simulator. 
It also has a better generalization ability when testing with unseen user simulators.", + "authors": [ + "Yajiao Liu", + "Xin Jiang", + "Yichun Yin", + "Yasheng Wang", + "Fei Mi", + "Qun Liu", + "Xiang Wan", + "Benyou Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.1", + "point2d": [ + 17.426937103271484, + 75.23506927490234 + ], + "cluster": 2.0 + }, + { + "idx": 3, + "title": "SafeConv: Explaining and Correcting Conversational Unsafe Behavior", + "abstract": "One of the main challenges open-domain end-to-end dialogue systems, or chatbots, face is the prevalence of unsafe behavior, such as toxic languages and harmful suggestions. However, existing dialogue datasets do not provide enough annotation to explain and correct such unsafe behavior. In this work, we construct a new dataset called SafeConv for the research of conversational safety: (1) Besides the utterance-level safety labels, SafeConv also provides unsafe spans in an utterance, information able to indicate which words contribute to the detected unsafe behavior; (2) SafeConv provides safe alternative responses to continue the conversation when unsafe behavior detected, guiding the conversation to a gentle trajectory. By virtue of the comprehensive annotation of SafeConv, we benchmark three powerful models for the mitigation of conversational unsafe behavior, including a checker to detect unsafe utterances, a tagger to extract unsafe spans, and a rewriter to convert an unsafe response to a safe version. Moreover, we explore the huge benefits brought by combining the models for explaining the emergence of unsafe behavior and detoxifying chatbots. Experiments show that the detected unsafe behavior could be well explained with unsafe spans and popular chatbots could be detoxified by a huge extent. The dataset is available at https://github.com/mianzhang/SafeConv.", + "authors": [ + "Mian Zhang", + "Lifeng Jin", + "Linfeng Song", + "Haitao Mi", + "Wenliang Chen", + "Dong Yu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.2", + "point2d": [ + 16.403419494628906, + 35.5329475402832 + ], + "cluster": 24.0 + }, + { + "idx": 4, + "title": "Detecting and Mitigating Hallucinations in Machine Translation: Model Internal Workings Alone Do Well, Sentence Similarity Even Better", + "abstract": "While the problem of hallucinations in neural machine translation has long been recognized, so far the progress on its alleviation is very little. Indeed, recently it turned out that without artificially encouraging models to hallucinate, previously existing methods fall short and even the standard sequence log-probability is more informative. It means that internal characteristics of the model can give much more information than we expect, and before using external models and measures, we first need to ask: how far can we go if we use nothing but the translation model itself ? We propose to use a method that evaluates the percentage of the source contribution to a generated translation. Intuitively, hallucinations are translations \u201cdetached\u201d from the source, hence they can be identified by low source contribution. This method improves detection accuracy for the most severe hallucinations by a factor of 2 and is able to alleviate hallucinations at test time on par with the previous best approach that relies on external models. 
Next, if we move away from internal model characteristics and allow external tools, we show that using sentence similarity from cross-lingual embeddings further improves these results. We release the code of our experiments.", + "authors": [ + "David Dale", + "Elena Voita", + "Loic Barrault", + "Marta R. Costa-juss\u00e0" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.3", + "point2d": [ + -76.25108337402344, + -10.936566352844238 + ], + "cluster": 1.0 + }, + { + "idx": 5, + "title": "Explainable Recommendation with Personalized Review Retrieval and Aspect Learning", + "abstract": "Explainable recommendation is a technique that combines prediction and generation tasks to produce more persuasive results. Among these tasks, textual generation demands large amounts of data to achieve satisfactory accuracy. However, historical user reviews of items are often insufficient, making it challenging to ensure the precision of generated explanation text. To address this issue, we propose a novel model, ERRA (Explainable Recommendation by personalized Review retrieval and Aspect learning). With retrieval enhancement, ERRA can obtain additional information from the training sets. With this additional information, we can generate more accurate and informative explanations. Furthermore, to better capture users\u2019 preferences, we incorporate an aspect enhancement component into our model. By selecting the top-n aspects that users are most concerned about for different items, we can model user representation with more relevant details, making the explanation more persuasive. To verify the effectiveness of our model, extensive experiments on three datasets show that our model outperforms state-of-the-art baselines (for example, 3.4% improvement in prediction and 15.8% improvement in explanation for TripAdvisor).", + "authors": [ + "Hao Cheng", + "Shuo Wang", + "Wensheng Lu", + "Wei Zhang", + "Mingyang Zhou", + "Kezhong Lu", + "Hao Liao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.4", + "point2d": [ + 2.5511391162872314, + 36.31180191040039 + ], + "cluster": 18.0 + }, + { + "idx": 6, + "title": "Binary and Ternary Natural Language Generation", + "abstract": "Ternary and binary neural networks enable multiplication-free computation and promise multiple orders of magnitude efficiency gains over full-precision networks if implemented on specialized hardware. However, since both the parameter and the output space are highly discretized, such networks have proven very difficult to optimize. The difficulties are compounded for the class of transformer text generation models due to the sensitivity of the attention operation to quantization and the noise-compounding effects of autoregressive decoding in the high-cardinality output space. We approach the problem with a mix of statistics-based quantization for the weights and elastic quantization of the activations and demonstrate the first ternary and binary transformer models on the downstream tasks of summarization and machine translation. Our ternary BART base achieves an R1 score of 41 on the CNN/DailyMail benchmark, which is merely 3.9 points behind the full model while being 16x more efficient. Our binary model, while less accurate, achieves a highly non-trivial score of 35.6. For machine translation, we achieved BLEU scores of 21.7 and 17.6 on the WMT16 En-Ro benchmark, compared with a full precision mBART model score of 26.8. 
We also compare our approach in the 8-bit activation setting, where our ternary and even binary weight models can match or outperform the best existing 8-bit weight models in the literature. Our code and models are available at: https://github.com/facebookresearch/Ternary_Binary_Transformer.", + "authors": [ + "Zechun Liu", + "Barlas Oguz", + "Aasish Pappu", + "Yangyang Shi", + "Raghuraman Krishnamoorthi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.5", + "point2d": [ + -31.183757781982422, + 9.79224967956543 + ], + "cluster": 6.0 + }, + { + "idx": 7, + "title": "Span-Selective Linear Attention Transformers for Effective and Robust Schema-Guided Dialogue State Tracking", + "abstract": "In schema-guided dialogue state tracking models estimate the current state of a conversation using natural language descriptions of the service schema for generalization to unseen services. Prior generative approaches which decode slot values sequentially do not generalize well to variations in schema, while discriminative approaches separately encode history and schema and fail to account for inter-slot and intent-slot dependencies. We introduce SPLAT, a novel architecture which achieves better generalization and efficiency than prior approaches by constraining outputs to a limited prediction space. At the same time, our model allows for rich attention among descriptions and history while keeping computation costs constrained by incorporating linear-time attention. We demonstrate the effectiveness of our model on the Schema-Guided Dialogue (SGD) and MultiWOZ datasets. Our approach significantly improves upon existing models achieving 85.3 JGA on the SGD dataset. Further, we show increased robustness on the SGD-X benchmark: our model outperforms the more than 30x larger D3ST-XXL model by 5.0 points.", + "authors": [ + "Bj\u00f6rn Bebensee", + "Haejun Lee" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.6", + "point2d": [ + 0.8033410906791687, + 71.99784851074219 + ], + "cluster": 49.0 + }, + { + "idx": 8, + "title": "EM Pre-training for Multi-party Dialogue Response Generation", + "abstract": "Dialogue response generation requires an agent to generate a response according to the current dialogue history, in terms of which two-party dialogues have been well studied, but leaving a great gap for multi-party dialogues at the same time. Different from two-party dialogues where each response is a direct reply to its previous utterance, the addressee of a response utterance should be specified before it is generated in the multi-party scenario. Thanks to the huge amount of two-party conversational data, various pre-trained language models for two-party dialogue response generation have been proposed. However, due to the lack of annotated addressee labels in multi-party dialogue datasets, it is hard to use them to pre-train a response generation model for multi-party dialogues. To tackle this obstacle, we propose an Expectation-Maximization (EM) approach that iteratively performs the expectation steps to generate addressee labels, and the maximization steps to optimize a response generation model. Theoretical analyses and extensive experiments have justified the feasibility and effectiveness of our proposed method. 
The official implementation of this paper is available at https://github.com/EricLee8/MPDRG.", + "authors": [ + "Yiyang Li", + "Hai Zhao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.7", + "point2d": [ + 12.60263729095459, + 65.29109191894531 + ], + "cluster": 49.0 + }, + { + "idx": 9, + "title": "ACLM: A Selective-Denoising based Generative Data Augmentation Approach for Low-Resource Complex NER", + "abstract": "Complex Named Entity Recognition (NER) is the task of detecting linguistically complex named entities in low-context text. In this paper, we present ACLM Attention-map aware keyword selection for Conditional Language Model fine-tuning), a novel data augmentation approach based on conditional generation, to address the data scarcity problem in low-resource complex NER. ACLM alleviates the context-entity mismatch issue, a problem existing NER data augmentation techniques suffer from and often generates incoherent augmentations by placing complex named entities in the wrong context. ACLM builds on BART and is optimized on a novel text reconstruction or denoising task - we use selective masking (aided by attention maps) to retain the named entities and certain keywords in the input sentence that provide contextually relevant additional knowledge or hints about the named entities. Compared with other data augmentation strategies, ACLM can generate more diverse and coherent augmentations preserving the true word sense of complex entities in the sentence. We demonstrate the effectiveness of ACLM both qualitatively and quantitatively on monolingual, cross-lingual, and multilingual complex NER across various low-resource settings. ACLM outperforms all our neural baselines by a significant margin (1%-36%). In addition, we demonstrate the application of ACLM to other domains that suffer from data scarcity (e.g., biomedical). In practice, ACLM generates more effective and factual augmentations for these domains than prior methods.", + "authors": [ + "Sreyan Ghosh", + "Utkarsh Tyagi", + "Manan Suri", + "Sonal Kumar", + "Ramaneswaran S", + "Dinesh Manocha" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.8", + "point2d": [ + 36.30806350708008, + -87.10093688964844 + ], + "cluster": 14.0 + }, + { + "idx": 10, + "title": "Natural Language to Code Generation in Interactive Data Science Notebooks", + "abstract": "Computational notebooks, such as Jupyter notebooks, are interactive computing environments that are ubiquitous among data scientists to perform data wrangling and analytic tasks. To measure the performance of AI pair programmers that automatically synthesize programs for those tasks given natural language (NL) intents from users, we build ARCADE, a benchmark of 1078 code generation problems using the pandas data analysis framework in data science notebooks. ARCADE features multiple rounds of NL-to-code problems from the same notebook. It requires a model to understand rich multi-modal contexts, such as existing notebook cells and their execution states as well as previous turns of interaction. To establish a strong baseline on this challenging task, we develop PaChiNCo, a 62B code language model (LM) for Python computational notebooks, which significantly outperforms public code LMs. 
Finally, we explore few-shot prompting strategies to elicit better code with step-by-step decomposition and NL explanation, showing the potential to improve the diversity and explainability of model predictions. Arcade is publicly available at https://github.com/google-research/arcade-nl2code/.", + "authors": [ + "Pengcheng Yin", + "Wen-Ding Li", + "Kefan Xiao", + "Abhishek Rao", + "Yeming Wen", + "Kensen Shi", + "Joshua Howland", + "Paige Bailey", + "Michele Catasta", + "Henryk Michalewski", + "Oleksandr Polozov", + "Charles Sutton" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.9", + "point2d": [ + -6.966467380523682, + -51.46390914916992 + ], + "cluster": 11.0 + }, + { + "idx": 11, + "title": "Subset Retrieval Nearest Neighbor Machine Translation", + "abstract": "k-nearest-neighbor machine translation (kNN-MT) (Khandelwal et al., 2021) boosts the translation performance of trained neural machine translation (NMT) models by incorporating example-search into the decoding algorithm. However, decoding is seriously time-consuming, i.e., roughly 100 to 1,000 times slower than standard NMT, because neighbor tokens are retrieved from all target tokens of parallel data in each timestep. In this paper, we propose \u201cSubset kNN-MT\u201d, which improves the decoding speed of kNN-MT by two methods: (1) retrieving neighbor target tokens from a subset that is the set of neighbor sentences of the input sentence, not from all sentences, and (2) efficient distance computation technique that is suitable for subset neighbor search using a look-up table. Our proposed method achieved a speed-up of up to 132.2 times and an improvement in BLEU score of up to 1.6 compared with kNN-MT in the WMT\u201919 De-En translation task and the domain adaptation tasks in De-En and En-Ja.", + "authors": [ + "Hiroyuki Deguchi", + "Taro Watanabe", + "Yusuke Matsui", + "Masao Utiyama", + "Hideki Tanaka", + "Eiichiro Sumita" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.10", + "point2d": [ + -67.02169036865234, + -14.6912841796875 + ], + "cluster": 21.0 + }, + { + "idx": 12, + "title": "MIL-Decoding: Detoxifying Language Models at Token-Level via Multiple Instance Learning", + "abstract": "Despite advances in large pre-trained neural language models, they are prone to generating toxic language, which brings security risks to their applications.We introduce MIL-Decoding, which detoxifies language models at token-level by interpolating it with a trained multiple instance learning (MIL) network.MIL model is trained on a corpus with a toxicity label for each text to predict the overall toxicity and the toxicity of each token in its context.Intuitively, the MIL network computes a toxicity distribution over next tokens according to the generated context which supplements the original language model to avoid toxicity.We evaluate MIL-Decoding with automatic metrics and human evaluation, where MIL-Decoding outperforms other baselines in detoxification while it only hurts generation fluency a little bit.", + "authors": [ + "Xu Zhang", + "Xiaojun Wan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.11", + "point2d": [ + 6.752921104431152, + 13.692534446716309 + ], + "cluster": 6.0 + }, + { + "idx": 13, + "title": "Dependency resolution at the syntax-semantics interface: psycholinguistic and computational insights on control dependencies", + "abstract": "Using 
psycholinguistic and computational experiments we compare the ability of humans and several pre-trained masked language models to correctly identify control dependencies in Spanish sentences such as \u2018Jos\u00e9 le prometi\u00f3/orden\u00f3 a Mar\u00eda ser ordenado/a\u2019 (\u2018Joseph promised/ordered Mary to be tidy\u2019). These structures underlie complex anaphoric and agreement relations at the interface of syntax and semantics, allowing us to study lexically-guided antecedent retrieval processes. Our results show that while humans correctly identify the (un)acceptability of the strings, language models often fail to identify the correct antecedent in non-adjacent dependencies, showing their reliance on linearity. Additional experiments on Galician reinforce these conclusions. Our findings are equally valuable for the evaluation of language models\u2019 ability to capture linguistic generalizations, as well as for psycholinguistic theories of anaphor resolution.", + "authors": [ + "Iria de-Dios-Flores", + "Juan Garcia Amboage", + "Marcos Garcia" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.12", + "point2d": [ + -16.323644638061523, + -70.55197143554688 + ], + "cluster": 41.0 + }, + { + "idx": 14, + "title": "Open-ended Long Text Generation via Masked Language Modeling", + "abstract": "Pre-trained autoregressive (AR) language models such as BART and GPTs have dominated OPen-ended Long Text Generation (Open-LTG).However, the AR nature will decrease the inference efficiency along with the increase of generation length, which hinder their application in Open-LTG.To improve inference efficiency, we alternatively explore the potential of the pre-trained masked language models (MLMs) along with a representative iterative non-autoregressive (NAR) decoding strategy for Open-LTG.Our preliminary study shows that pre-trained MLMs can merely generate short text and will collapse for long text modeling.To enhance the long text generation capability of MLMs, we introduce two simple yet effective strategies for the iterative NAR model: dynamic sliding window attention (DSWA) and linear temperature decay (LTD). It can alleviate long-distance collapse problems and achieve longer text generation with a flexible trade-off between performance and inference speedup.Experiments on the storytelling and multi-paragraph opinionated article writing tasks show that pre-trained MLMs can achieve more than 3 \\times\n \\to 13 \\times speedup with better performance than strong AR models.", + "authors": [ + "Xiaobo Liang", + "Zecheng Tang", + "Juntao Li", + "Min Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.13", + "point2d": [ + -29.008079528808594, + 10.409829139709473 + ], + "cluster": 4.0 + }, + { + "idx": 15, + "title": "A Method for Studying Semantic Construal in Grammatical Constructions with Interpretable Contextual Embedding Spaces", + "abstract": "We study semantic construal in grammatical constructions using large language models. First, we project contextual word embeddings into three interpretable semantic spaces, each defined by a different set of psycholinguistic feature norms. We validate these interpretable spaces and then use them to automatically derive semantic characterizations of lexical items in two grammatical constructions: nouns in subject or object position within the same sentence, and the AANN construction (e.g., \u2018a beautiful three days\u2019). 
We show that a word in subject position is interpreted as more agentive than the very same word in object position, and that the nouns in the AANN construction are interpreted as more measurement-like than when in the canonical alternation. Our method can probe the distributional meaning of syntactic constructions at a templatic level, abstracted away from specific lexemes.", + "authors": [ + "Gabriella Chronis", + "Kyle Mahowald", + "Katrin Erk" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.14", + "point2d": [ + 8.220794677734375, + -49.56673049926758 + ], + "cluster": 9.0 + }, + { + "idx": 16, + "title": "Holographic CCG Parsing", + "abstract": "We propose a method for formulating CCG as a recursive composition in a continuous vector space. Recent CCG supertagging and parsing models generally demonstrate high performance, yet rely on black-box neural architectures to implicitly model phrase structure dependencies. Instead, we leverage the method of holographic embeddings as a compositional operator to explicitly model the dependencies between words and phrase structures in the embedding space. Experimental results revealed that holographic composition effectively improves the supertagging accuracy to achieve state-of-the-art parsing performance when using a C&C parser. The proposed span-based parsing algorithm using holographic composition achieves performance comparable to state-of-the-art neural parsing with Transformers. Furthermore, our model can semantically and syntactically infill text at the phrase level due to the decomposability of holographic composition.", + "authors": [ + "Ryosuke Yamaki", + "Tadahiro Taniguchi", + "Daichi Mochihashi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.15", + "point2d": [ + -25.588573455810547, + -60.803131103515625 + ], + "cluster": 41.0 + }, + { + "idx": 17, + "title": "Prompts Can Play Lottery Tickets Well: Achieving Lifelong Information Extraction via Lottery Prompt Tuning", + "abstract": "Thanks to the recent success of Pre-trained Language Models (PLMs), it has become a promising research direction to develop a universal model (UIE) that can solve all typical information extraction tasks within one generative framework. Nonetheless, in real-world scenarios of UIE applications, new data of different IE tasks and domains usually come in a stream over time. A desirable UIE system should be capable of continually learning new tasks without forgetting old ones, thereby allowing knowledge and functionalities expansion without re-training the whole system. 
In this paper, we study the UIE system under a more challenging yet practical scenario, i.e., \u201clifelong learning\u201d settings, to evaluate its abilities in three aspects, including knowledge sharing and expansion, catastrophic forgetting prevention, and rapid generalization on few-shot and unseen tasks.To achieve these three goals, we present a novel parameter- and deployment-efficient prompt tuning method namely Lottery Prompt Tuning (LPT).LPT freezes the PLM\u2019s parameters and sequentially learns compact pruned prompt vectors for each task leveraging a binary prompt mask, while keeping the prompt parameters selected by the previous tasks insusceptible.Furthermore, we use a simple yet effective method to perform mask selection and show the powerful transferability of Lottery Prompts to novel tasks.Extensive experiments demonstrate that LPT consistently sets state-of-the-art performance on multiple lifelong learning settings of UIE, including task-incremental setting on seen tasks, few-shot adaptation, and zero-shot generalization on novel tasks.", + "authors": [ + "Zujie Liang", + "Feng Wei", + "Yin Jie", + "Yuxi Qian", + "Zhenghong Hao", + "Bing Han" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.16", + "point2d": [ + -19.560606002807617, + -7.94928503036499 + ], + "cluster": 20.0 + }, + { + "idx": 18, + "title": "Retrieve-and-Sample: Document-level Event Argument Extraction via Hybrid Retrieval Augmentation", + "abstract": "Recent studies have shown the effectiveness of retrieval augmentation in many generative NLP tasks. These retrieval-augmented methods allow models to explicitly acquire prior external knowledge in a non-parametric manner and regard the retrieved reference instances as cues to augment text generation. These methods use similarity-based retrieval, which is based on a simple hypothesis: the more the retrieved demonstration resembles the original input, the more likely the demonstration label resembles the input label. However, due to the complexity of event labels and sparsity of event arguments, this hypothesis does not always hold in document-level EAE. This raises an interesting question: How do we design the retrieval strategy for document-level EAE? We investigate various retrieval settings from the input and label distribution views in this paper. We further augment document-level EAE with pseudo demonstrations sampled from event semantic regions that can cover adequate alternatives in the same context and event schema. 
Through extensive experiments on RAMS and WikiEvents, we demonstrate the validity of our newly introduced retrieval-augmented methods and analyze why they work.", + "authors": [ + "Yubing Ren", + "Yanan Cao", + "Ping Guo", + "Fang Fang", + "Wei Ma", + "Zheng Lin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.17", + "point2d": [ + 42.6258430480957, + -42.98621368408203 + ], + "cluster": 28.0 + }, + { + "idx": 19, + "title": "WeCheck: Strong Factual Consistency Checker via Weakly Supervised Learning", + "abstract": "A crucial issue of current text generation models is that they often uncontrollably generate text that is factually inconsistent with inputs.Due to lack of annotated data, existing factual consistency metrics usually train evaluation models on synthetic texts or directly transfer from other related tasks, such as question answering (QA) and natural language inference (NLI).Bias in synthetic text or upstream tasks makes them perform poorly on text actually generated by language models, especially for general evaluation for various tasks.To alleviate this problem, we propose a weakly supervised framework named WeCheck that is directly trained on actual generated samples from language models with weakly annotated labels.WeCheck first utilizes a generative model to infer the factual labels of generated samples by aggregating weak labels from multiple resources.Next, we train a simple noise-aware classification model as the target metric using the inferred weakly supervised information.Comprehensive experiments on various tasks demonstrate the strong performance of WeCheck, achieving an average absolute improvement of 3.3% on the TRUE benchmark over 11B state-of-the-art methods using only 435M parameters.Furthermore, it is up to 30 times faster than previous evaluation methods, greatly improving the accuracy and efficiency of factual consistency evaluation.", + "authors": [ + "Wenhao Wu", + "Wei Li", + "Xinyan Xiao", + "Jiachen Liu", + "Sujian Li", + "Yajuan Lyu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.18", + "point2d": [ + 25.63334083557129, + 5.617829322814941 + ], + "cluster": 4.0 + }, + { + "idx": 20, + "title": "AMR-based Network for Aspect-based Sentiment Analysis", + "abstract": "Aspect-based sentiment analysis (ABSA) is a fine-grained sentiment classification task. Many recent works have used dependency trees to extract the relation between aspects and contexts and have achieved significant improvements. However, further improvement is limited due to the potential mismatch between the dependency tree as a syntactic structure and the sentiment classification as a semantic task. To alleviate this gap, we replace the syntactic dependency tree with the semantic structure named Abstract Meaning Representation (AMR) and propose a model called AMR-based Path Aggregation Relational Network (APARN) to take full advantage of semantic structures. In particular, we design the path aggregator and the relation-enhanced self-attention mechanism that complement each other. The path aggregator extracts semantic features from AMRs under the guidance of sentence information, while the relation-enhanced self-attention mechanism in turn improves sentence features with refined semantic information. 
Experimental results on four public datasets demonstrate 1.13% average F1 improvement of APARN in ABSA when compared with state-of-the-art baselines.", + "authors": [ + "Fukun Ma", + "Xuming Hu", + "Aiwei Liu", + "Yawen Yang", + "Shuang Li", + "Philip S. Yu", + "Lijie Wen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.19", + "point2d": [ + 17.86332130432129, + -35.557777404785156 + ], + "cluster": 13.0 + }, + { + "idx": 21, + "title": "Text Adversarial Purification as Defense against Adversarial Attacks", + "abstract": "Adversarial purification is a successful defense mechanism against adversarial attacks without requiring knowledge of the form of the incoming attack.Generally, adversarial purification aims to remove the adversarial perturbations therefore can make correct predictions based on the recovered clean samples.Despite the success of adversarial purification in the computer vision field that incorporates generative models such as energy-based models and diffusion models,using purification as a defense strategy against textual adversarial attacks is rarely explored.In this work, we introduce a novel adversarial purification method that focuses on defending against textual adversarial attacks.With the help of language models, we can inject noise by masking input texts and reconstructing the masked texts based on the masked language models.In this way, we construct an adversarial purification process for textual models against the most widely used word-substitution adversarial attacks.We test our proposed adversarial purification method on several strong adversarial attack methods including Textfooler and BERT-Attack and experimental results indicate that the purification algorithm can successfully defend against strong word-substitution attacks.", + "authors": [ + "Linyang Li", + "Demin Song", + "Xipeng Qiu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.20", + "point2d": [ + 5.386714458465576, + 6.481171607971191 + ], + "cluster": 48.0 + }, + { + "idx": 22, + "title": "SPEECH: Structured Prediction with Energy-Based Event-Centric Hyperspheres", + "abstract": "Event-centric structured prediction involves predicting structured outputs of events. In most NLP cases, event structures are complex with manifold dependency, and it is challenging to effectively represent these complicated structured events. To address these issues, we propose Structured Prediction with Energy-based Event-Centric Hyperspheres (SPEECH). SPEECH models complex dependency among event structured components with energy-based modeling, and represents event classes with simple but effective hyperspheres. Experiments on two unified-annotated event datasets indicate that SPEECH is predominant in event detection and event-relation extraction tasks.", + "authors": [ + "Shumin Deng", + "Shengyu Mao", + "Ningyu Zhang", + "Bryan Hooi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.21", + "point2d": [ + 49.04600143432617, + -50.1035041809082 + ], + "cluster": 28.0 + }, + { + "idx": 23, + "title": "Rule By Example: Harnessing Logical Rules for Explainable Hate Speech Detection", + "abstract": "Classic approaches to content moderation typically apply a rule-based heuristic approach to flag content. 
While rules are easily customizable and intuitive for humans to interpret, they are inherently fragile and lack the flexibility or robustness needed to moderate the vast amount of undesirable content found online today. Recent advances in deep learning have demonstrated the promise of using highly effective deep neural models to overcome these challenges. However, despite the improved performance, these data-driven models lack transparency and explainability, often leading to mistrust from everyday users and a lack of adoption by many platforms. In this paper, we present Rule By Example (RBE): a novel exemplar-based contrastive learning approach for learning from logical rules for the task of textual content moderation. RBE is capable of providing rule-grounded predictions, allowing for more explainable and customizable predictions compared to typical deep learning-based approaches. We demonstrate that our approach is capable of learning rich rule embedding representations using only a few data examples. Experimental results on 3 popular hate speech classification datasets show that RBE is able to outperform state-of-the-art deep learning classifiers as well as the use of rules in both supervised and unsupervised settings while providing explainable model predictions via rule-grounding.", + "authors": [ + "Christopher Clarke", + "Matthew Hall", + "Gaurav Mittal", + "Ye Yu", + "Sandra Sajeev", + "Jason Mars", + "Mei Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.22", + "point2d": [ + 12.9916353225708, + 38.44235610961914 + ], + "cluster": 34.0 + }, + { + "idx": 24, + "title": "What about \u201cem\u201d? How Commercial Machine Translation Fails to Handle (Neo-)Pronouns", + "abstract": "As 3rd-person pronoun usage shifts to include novel forms, e.g., neopronouns, we need more research on identity-inclusive NLP. Exclusion is particularly harmful in one of the most popular NLP applications, machine translation (MT). Wrong pronoun translations can discriminate against marginalized groups, e.g., non-binary individuals (Dev et al., 2021). In this \u201creality check\u201d, we study how three commercial MT systems translate 3rd-person pronouns. Concretely, we compare the translations of gendered vs. gender-neutral pronouns from English to five other languages (Danish, Farsi, French, German, Italian), and vice versa, from Danish to English.Our error analysis shows that the presence of a gender-neutral pronoun often leads to grammatical and semantic translation errors. Similarly, gender neutrality is often not preserved. By surveying the opinions of affected native speakers from diverse languages, we provide recommendations to address the issue in future MT research.", + "authors": [ + "Anne Lauscher", + "Debora Nozza", + "Ehm Miltersen", + "Archie Crowley", + "Dirk Hovy" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.23", + "point2d": [ + -54.478424072265625, + -0.5633244514465332 + ], + "cluster": 1.0 + }, + { + "idx": 25, + "title": "What Is Overlap Knowledge in Event Argument Extraction? APE: A Cross-datasets Transfer Learning Model for EAE", + "abstract": "The EAE task extracts a structured event record from an event text. Most existing approaches train the EAE model on each dataset independently and ignore the overlap knowledge across datasets. However, insufficient event records in a single dataset often prevent the existing model from achieving better performance. 
In this paper, we clearly define the overlap knowledge across datasets and split the knowledge of the EAE task into overlap knowledge across datasets and specific knowledge of the target dataset. We propose APE model to learn the two parts of knowledge in two serial learning phases without causing catastrophic forgetting. In addition, we formulate both learning phases as conditional generation tasks and design Stressing Entity Type Prompt to close the gap between the two phases. The experiments show APE achieves new state-of-the-art with a large margin in the EAE task. When only ten records are available in the target dataset, our model dramatically outperforms the baseline model with average 27.27% F1 gain.", + "authors": [ + "Kaihang Zhang", + "Kai Shuang", + "Xinyue Yang", + "Xuyang Yao", + "Jinyu Guo" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.24", + "point2d": [ + 43.80473327636719, + -45.091163635253906 + ], + "cluster": 28.0 + }, + { + "idx": 26, + "title": "Tailor: A Soft-Prompt-Based Approach to Attribute-Based Controlled Text Generation", + "abstract": "Attribute-based Controlled Text Generation (CTG) refers to generating sentences that satisfy desirable attributes (e.g., emotions and topics). Existing work usually utilize fine-tuning or resort to extra attribute classifiers, yet suffer from increases in storage and inference time. To address these concerns, we explore attribute-based CTG in a parameter-efficient manner. In short, the proposed Tailor represents each attribute as a pre-trained continuous vector i.e., single-attribute prompt), which guides the generation of a fixed pre-trained language model (PLM) to satisfy a pre-specified attribute. These prompts can be simply concatenated as a whole for multi-attribute CTG without any re-training. Nevertheless, this may raise problems of fluency downgrading and position sensitivity. To solve this, Tailor provides two solutions to enhance the combination. The former contains a multi-attribute prompt mask and a re-indexing position sequence to bridge the gap between the training (one single-attribute prompt for each task) and the testing stage (concatenating two prompts). The latter introduces a trainable prompt connector to further enhance the combinations. Experiments demonstrate that, only requiring 0.08% extra training parameters of the GPT-2, Tailor can achieve effective and general improvements on eleven attribute-specific generation tasks.", + "authors": [ + "Kexin Yang", + "Dayiheng Liu", + "Wenqiang Lei", + "Baosong Yang", + "Mingfeng Xue", + "Boxing Chen", + "Jun Xie" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.25", + "point2d": [ + -24.000396728515625, + 7.029482364654541 + ], + "cluster": 4.0 + }, + { + "idx": 27, + "title": "Knowledge of cultural moral norms in large language models", + "abstract": "Moral norms vary across cultures. A recent line of work suggests that English large language models contain human-like moral biases, but these studies typically do not examine moral variation in a diverse cultural setting. We investigate the extent to which monolingual English language models contain knowledge about moral norms in different countries. 
We consider two levels of analysis: 1) whether language models capture fine-grained moral variation across countries over a variety of topics such as \u201chomosexuality\u201d and \u201cdivorce\u201d; 2) whether language models capture cultural diversity and shared tendencies in which topics people around the globe tend to diverge or agree on in their moral judgment. We perform our analyses with two public datasets from the World Values Survey (across 55 countries) and PEW global surveys (across 40 countries) on morality. We find that pre-trained English language models predict empirical moral norms across countries worse than the English moral norms reported previously. However, fine-tuning language models on the survey data improves inference across countries at the expense of a less accurate estimate of the English moral norms. We discuss the relevance and challenges of incorporating cultural knowledge into the automated inference of moral norms.", + "authors": [ + "Aida Ramezani", + "Yang Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.26", + "point2d": [ + 27.38128662109375, + 32.53695297241211 + ], + "cluster": 10.0 + }, + { + "idx": 28, + "title": "Songs Across Borders: Singable and Controllable Neural Lyric Translation", + "abstract": "The development of general-domain neural machine translation (NMT) methods has advanced significantly in recent years, but the lack of naturalness and musical constraints in the outputs makes them unable to produce singable lyric translations. This paper bridges the singability quality gap by formalizing lyric translation into a constrained translation problem, converting theoretical guidance and practical techniques from translatology literature to prompt-driven NMT approaches, exploring better adaptation methods, and instantiating them to an English-Chinese lyric translation system. Our model achieves 99.85%, 99.00%, and 95.52% on length accuracy, rhyme accuracy, and word boundary recall. In our subjective evaluation, our model shows a 75% relative enhancement on overall quality, compared against naive fine-tuning (Code available at https://github.com/Sonata165/ControllableLyricTranslation).", + "authors": [ + "Longshen Ou", + "Xichu Ma", + "Min-Yen Kan", + "Ye Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.27", + "point2d": [ + -33.40388488769531, + 41.93490982055664 + ], + "cluster": 35.0 + }, + { + "idx": 29, + "title": "Fantastic Expressions and Where to Find Them: Chinese Simile Generation with Multiple Constraints", + "abstract": "Similes occur in the creative context of describing a concept (i.e., tenor) by making a literally false yet figuratively meaningful comparison to another (i.e., vehicle). Previous efforts form simile generation as a context-free generation task, focusing on simile-style transfer or writing a simile from a given prefix. However, generated texts under such settings might be undesirable, such as hardly meeting the simile definition (e.g., missing vehicle) or difficult to address certain preferences of content as humans wish (e.g., describe the color of apples through the simile). We believe that a simile could be more qualified and user-oriented if incorporated with pre-specified constraints. To this end, we introduce controllable simile generation (CSG), a new task that requires the model to generate a simile with multiple simile elements, e.g., context and vehicle. 
To facilitate this task, we present GraCe, including 61.3k simile-element annotated Chinese similes. Based on it, we propose a CSG model Similor to benchmark this task, including a vehicle retrieval module Scorer to obtain the explicable comparison for a given tenor in the vehicle-unknown situation. Both statistical and experimental analyses show that GraCe is of high quality beyond all other Chinese simile datasets, in terms of the number (8 vs. 3) of annotation elements, Is-Simile accuracy (98.9% vs. 78.7%), and increasing model-performance gains for both uncontrollable and controllable simile generation. Meanwhile, Similor can serve as a strong baseline for CSG, especially with Scorer, which beats model-based retrieval methods without any re-training.", + "authors": [ + "Kexin Yang", + "Dayiheng Liu", + "Wenqiang Lei", + "Baosong Yang", + "Xiangpeng Wei", + "Zhengyuan Liu", + "Jun Xie" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.28", + "point2d": [ + -26.935298919677734, + 42.61740493774414 + ], + "cluster": 35.0 + }, + { + "idx": 30, + "title": "Revealing Single Frame Bias for Video-and-Language Learning", + "abstract": "Training an effective video-and-language model intuitively requires multiple frames as model inputs. However, it is unclear whether using multiple frames is beneficial to downstream tasks, and if yes, whether the performance gain is worth the drastically-increased computation and memory costs resulting from using more frames. In this work, we explore single-frame models for video-and-language learning. On a diverse set of video-and-language tasks (including text-to-video retrieval and video question answering), we show the surprising result that, with large-scale pre-training and a proper frame ensemble strategy at inference time, a single-frame trained model that does not consider temporal information can achieve better performance than existing methods that use multiple frames for training. This result reveals the existence of a strong \u201cstatic appearance bias\u201d in popular video-and-language datasets. Therefore, to allow for a more comprehensive evaluation of video-and-language models, we propose two new retrieval tasks based on existing fine-grained action recognition datasets that encourage temporal modeling. 
Our code is available at https://github.com/jayleicn/singularity.", + "authors": [ + "Jie Lei", + "Tamara Berg", + "Mohit Bansal" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.29", + "point2d": [ + -61.01979446411133, + 60.1010856628418 + ], + "cluster": 26.0 + }, + { + "idx": 31, + "title": "Learning with Partial Annotations for Event Detection", + "abstract": "Event detection (ED) seeks to discover and classify event instances in plain texts.Previous methods for ED typically adopt supervised learning, requiring fully labeled and high-quality training data.However, in a real-world application, we may not obtain clean training data but only partially labeled one, which could substantially impede the learning process.In this work, we conduct a seminal study for learning with partial annotations for ED.We propose a new trigger localization formulation using contrastive learning to distinguish ground-truth triggers from contexts, showing a decent robustness for addressing partial annotation noise.Impressively, in an extreme scenario where more than 90% of events are unlabeled, our approach achieves an F1 score of over 60%.In addition, we re-annotate and make available two fully annotated subsets of ACE 2005 to serve as an unbiased benchmark for event detection.We hope our approach and data will inspire future studies on this vital yet understudied problem.", + "authors": [ + "Jian Liu", + "Dianbo Sui", + "Kang Liu", + "Haoyan Liu", + "Zhe Zhao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.30", + "point2d": [ + 51.741058349609375, + -38.0834846496582 + ], + "cluster": 28.0 + }, + { + "idx": 32, + "title": "World-to-Words: Grounded Open Vocabulary Acquisition through Fast Mapping in Vision-Language Models", + "abstract": "The ability to connect language units to their referents in the physical world, referred to as grounding, is crucial to learning and understanding grounded meanings of words. While humans demonstrate fast mapping in new word learning, it remains unclear whether modern vision-language models can truly represent language with their grounded meanings, and how grounding may further bootstrap new word learning. To this end, we introduce Grounded Open Vocabulary Acquisition (GOVA) to examine grounding and bootstrapping in open-world language learning. As an initial attempt, we propose World-to-Words (W2W), a novel visually-grounded language model by pre-training on image-text pairs highlighting grounding as an objective. Through extensive experiments and analysis, we demonstrate that W2W is a more coherent and fast grounded word learner, and that the grounding ability acquired during pre-training helps the model to learn unseen words more rapidly and robustly.", + "authors": [ + "Ziqiao Ma", + "Jiayi Pan", + "Joyce Chai" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.31", + "point2d": [ + -50.43263626098633, + 33.87696075439453 + ], + "cluster": 43.0 + }, + { + "idx": 33, + "title": "A Causal Framework to Quantify the Robustness of Mathematical Reasoning with Language Models", + "abstract": "We have recently witnessed a number of impressive results on hard mathematical reasoning problems with language models. 
At the same time, the robustness of these models has also been called into question; recent works have shown that models can rely on shallow patterns in the problem description when generating a solution.Building on the idea of behavioral testing, we propose a novel framework, which pins down the causal effect of various factors in the input, e.g., the surface form of the problem text, the operands, and math operators on the output solution.By grounding the behavioral analysis in a causal graph describing an intuitive reasoning process, we study the behavior of language models in terms of robustness and sensitivity to direct interventions in the input space. We apply our framework on a test bed of math word problems.Our analysis shows that robustness does not appear to continuously improve as a function of size, but the GPT-3 Davinci models (175B) achieve a dramatic improvement in both robustness and sensitivity compared to all other GPT variants.", + "authors": [ + "Alessandro Stolfo", + "Zhijing Jin", + "Kumar Shridhar", + "Bernhard Schoelkopf", + "Mrinmaya Sachan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.32", + "point2d": [ + 47.092079162597656, + -20.11173439025879 + ], + "cluster": 12.0 + }, + { + "idx": 34, + "title": "Evaluating Open-Domain Dialogues in Latent Space with Next Sentence Prediction and Mutual Information", + "abstract": "The long-standing one-to-many issue of the open-domain dialogues poses significant challenges for automatic evaluation methods, i.e., there may be multiple suitable responses which differ in semantics for a given conversational context.To tackle this challenge, we propose a novel learning-based automatic evaluation metric (CMN), which can robustly evaluate open-domain dialogues by augmenting Conditional Variational Autoencoders (CVAEs) with a Next Sentence Prediction (NSP) objective and employing Mutual Information (MI) to model the semantic similarity of text in the latent space. Experimental results on two open-domain dialogue datasets demonstrate the superiority of our method compared with a wide range of baselines, especially in handling responses which are distant to the \u201cgolden\u201d reference responses in semantics.", + "authors": [ + "Kun Zhao", + "Bohao Yang", + "Chenghua Lin", + "Wenge Rong", + "Aline Villavicencio", + "Xiaohui Cui" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.33", + "point2d": [ + 13.065266609191895, + 68.78470611572266 + ], + "cluster": 49.0 + }, + { + "idx": 35, + "title": "Increasing Diversity While Maintaining Accuracy: Text Data Generation with Large Language Models and Human Interventions", + "abstract": "Large language models (LLMs) can be used to generate text data for training and evaluating other models. However, creating high-quality datasets with LLMs can be challenging. In this work, we explore human-AI partnerships to facilitate high diversity and accuracy in LLM-based text data generation. We first examine two approaches to diversify text generation: 1) logit suppression, which minimizes the generation of languages that have already been frequently generated, and 2) temperature sampling, which flattens the token sampling probability. We found that diversification approaches can increase data diversity but often at the cost of data accuracy (i.e., text and labels being appropriate for the target domain). 
To address this issue, we examined two human interventions, 1) label replacement (LR), correcting misaligned labels, and 2) out-of-scope filtering (OOSF), removing instances that are out of the user\u2019s domain of interest or to which no considered label applies. With oracle studies, we found that LR increases the absolute accuracy of models trained with diversified datasets by 14.4%. Moreover, we found that some models trained with data generated with LR interventions outperformed LLM-based few-shot classification. In contrast, OOSF was not effective in increasing model accuracy, implying the need for future work in human-in-the-loop text data generation.", + "authors": [ + "John Chung", + "Ece Kamar", + "Saleema Amershi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.34", + "point2d": [ + -21.720582962036133, + 4.421612739562988 + ], + "cluster": 4.0 + }, + { + "idx": 36, + "title": "Pruning Pre-trained Language Models Without Fine-Tuning", + "abstract": "To overcome the overparameterized problem in Pre-trained Language Models (PLMs), pruning is widely used as a simple and straightforward compression method by directly removing unimportant weights. Previous first-order methods successfully compress PLMs to extremely high sparsity with little performance drop. These methods, such as movement pruning, use first-order information to prune PLMs while fine-tuning the remaining weights. In this work, we argue fine-tuning is redundant for first-order pruning, since first-order pruning is sufficient to converge PLMs to downstream tasks without fine-tuning. Under this motivation, we propose Static Model Pruning (SMP), which only uses first-order pruning to adapt PLMs to downstream tasks while achieving the target sparsity level. In addition, we also design a new masking function and training objective to further improve SMP. Extensive experiments at various sparsity levels show SMP has significant improvements over first-order and zero-order methods. Unlike previous first-order methods, SMP is also applicable to low sparsity and outperforms zero-order methods. Meanwhile, SMP is more parameter efficient than other methods due to it does not require fine-tuning.", + "authors": [ + "Ting Jiang", + "Deqing Wang", + "Fuzhen Zhuang", + "Ruobing Xie", + "Feng Xia" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.35", + "point2d": [ + -38.390933990478516, + -21.326663970947266 + ], + "cluster": 8.0 + }, + { + "idx": 37, + "title": "When Does Translation Require Context? A Data-driven, Multilingual Exploration", + "abstract": "Although proper handling of discourse significantly contributes to the quality of machine translation (MT), these improvements are not adequately measured in common translation quality metrics. Recent works in context-aware MT attempt to target a small set of discourse phenomena during evaluation, however not in a fully systematic way. In this paper, we develop the Multilingual Discourse-Aware (MuDA) benchmark, a series of taggers that identify and evaluate model performance on discourse phenomena in any given dataset. The choice of phenomena is inspired by a novel methodology to systematically identify translations that require context. This methodology confirms the difficulty of previously studied phenomena while uncovering others which were not previously addressed. 
We find that commonly studied context-aware MT models make only marginal improvements over context-agnostic models, which suggests these models do not handle these ambiguities effectively. We release code and data for 14 language pairs to encourage the MT community to focus on accurately capturing discourse phenomena. Code available at https://github.com/neulab/contextual-mt", + "authors": [ + "Patrick Fernandes", + "Kayo Yin", + "Emmy Liu", + "Andr\u00e9 Martins", + "Graham Neubig" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.36", + "point2d": [ + -68.17119598388672, + 2.6643660068511963 + ], + "cluster": 1.0 + }, + { + "idx": 38, + "title": "Causal Intervention and Counterfactual Reasoning for Multi-modal Fake News Detection", + "abstract": "Due to the rapid upgrade of social platforms, most of today\u2019s fake news is published and spread in a multi-modal form. Most existing multi-modal fake news detection methods neglect the fact that some label-specific features learned from the training set cannot generalize well to the testing set, thus inevitably suffering from the harm caused by the latent data bias. In this paper, we analyze and identify the psycholinguistic bias in the text and the bias of inferring news label based on only image features. We mitigate these biases from a causality perspective and propose a Causal intervention and Counterfactual reasoning based Debiasing framework (CCD) for multi-modal fake news detection. To achieve our goal, we first utilize causal intervention to remove the psycholinguistic bias which introduces the spurious correlations between text features and news label. And then, we apply counterfactual reasoning by imagining a counterfactual world where each news has only image features for estimating the direct effect of the image. Therefore we can eliminate the image-only bias by deducting the direct effect of the image from the total effect on labels. Extensive experiments on two real-world benchmark datasets demonstrate the effectiveness of our framework for improving multi-modal fake news detection.", + "authors": [ + "Ziwei Chen", + "Linmei Hu", + "Weixin Li", + "Yingxia Shao", + "Liqiang Nie" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.37", + "point2d": [ + 34.822017669677734, + 19.21466827392578 + ], + "cluster": 34.0 + }, + { + "idx": 39, + "title": "LexSym: Compositionality as Lexical Symmetry", + "abstract": "In tasks like semantic parsing, instruction following, and question answering, standard deep networks fail to generalize compositionally from small datasets. Many existing approaches overcome this limitation with model architectures that enforce a compositional process of sentence interpretation. In this paper, we present a domain-general and model-agnostic formulation of compositionality as a constraint on symmetries of data distributions rather than models. Informally, we prove that whenever a task can be solved by a compositional model, there is a corresponding data augmentation scheme \u2014 a procedure for transforming examples into other well-formed examples \u2014 that imparts compositional inductive bias on any model trained to solve the same task. We describe a procedure called LexSym that discovers these transformations automatically, then applies them to training data for ordinary neural sequence models. 
Unlike existing compositional data augmentation procedures, LexSym can be deployed agnostically across text, structured data, and even images. It matches or surpasses state-of-the-art, task-specific models on COGS semantic parsing, SCAN and Alchemy instruction following, and CLEVR-CoGenT visual question answering datasets.",
    "authors": [
      "Ekin Akyurek",
      "Jacob Andreas"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.38",
    "point2d": [
      -24.838748931884766,
      -54.80317306518555
    ],
    "cluster": 41.0
  },
  {
    "idx": 40,
    "title": "Layer-wise Fusion with Modality Independence Modeling for Multi-modal Emotion Recognition",
    "abstract": "Multi-modal emotion recognition has gained increasing attention in recent years due to its widespread applications and the advances in multi-modal learning approaches. However, previous studies primarily focus on developing models that exploit the unification of multiple modalities. In this paper, we propose that maintaining modality independence is beneficial for the model performance. According to this principle, we construct a dataset, and devise a multi-modal transformer model. The new dataset, CHinese Emotion Recognition dataset with Modality-wise Annotations, abbreviated as CHERMA, provides uni-modal labels for each individual modality, and multi-modal labels for all modalities jointly observed. The model consists of uni-modal transformer modules that learn representations for each modality, and a multi-modal transformer module that fuses all modalities. All the modules are supervised by their corresponding labels separately, and the forward information flow is uni-directionally from the uni-modal modules to the multi-modal module. The supervision strategy and the model architecture guarantee each individual modality learns its representation independently, and meanwhile the multi-modal module aggregates all information. Extensive empirical results demonstrate that our proposed scheme outperforms state-of-the-art alternatives, corroborating the importance of modality independence in multi-modal emotion recognition. The dataset and codes are available at https://github.com/sunjunaimer/LFMIM",
    "authors": [
      "Jun Sun",
      "Shoukang Han",
      "Yu-Ping Ruan",
      "Xiaoning Zhang",
      "Shu-Kai Zheng",
      "Yulong Liu",
      "Yuxin Huang",
      "Taihao Li"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.39",
    "point2d": [
      -38.82099914550781,
      63.03227233886719
    ],
    "cluster": 23.0
  },
  {
    "idx": 41,
    "title": "CASN:Class-Aware Score Network for Textual Adversarial Detection",
    "abstract": "Adversarial detection aims to detect adversarial samples that threaten the security of deep neural networks, which is an essential step toward building robust AI systems. Density-based estimation is widely considered as an effective technique by explicitly modeling the distribution of normal data and identifying adversarial ones as outliers. However, these methods suffer from significant performance degradation when the adversarial samples lie close to the non-adversarial data manifold. To address this limitation, we propose a score-based generative method to implicitly model the data distribution. Our approach utilizes the gradient of the log-density data distribution and calculates the distribution gap between adversarial and normal samples through multi-step iterations using Langevin dynamics.
In addition, we use supervised contrastive learning to guide the gradient estimation using label information, which avoids collapsing to a single data manifold and better preserves the anisotropy of the different labeled data distributions. Experimental results on three text classification tasks upon four advanced attack algorithms show that our approach is a significant improvement (average +15.2 F1 score against previous SOTA) over previous detection methods.", + "authors": [ + "Rong Bao", + "Rui Zheng", + "Liang Ding", + "Qi Zhang", + "Dacheng Tao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.40", + "point2d": [ + 6.326344966888428, + 5.551853179931641 + ], + "cluster": 48.0 + }, + { + "idx": 42, + "title": "Do Androids Laugh at Electric Sheep? Humor \u201cUnderstanding\u201d Benchmarks from The New Yorker Caption Contest", + "abstract": "Large neural networks can now generate jokes, but do they really \u201cunderstand\u201d humor? We challenge AI models with three tasks derived from the New Yorker Cartoon Caption Contest: matching a joke to a cartoon, identifying a winning caption, and explaining why a winning caption is funny. These tasks encapsulate progressively more sophisticated aspects of \u201cunderstanding\u201d a cartoon; key elements are the complex, often surprising relationships between images and captions and the frequent inclusion of indirect and playful allusions to human experience and culture. We investigate both multimodal and language-only models: the former are challenged with the cartoon images directly, while the latter are given multifaceted descriptions of the visual scene to simulate human-level visual understanding. We find that both types of models struggle at all three tasks. For example, our best multimodal models fall 30 accuracy points behind human performance on the matching task, and, even when provided ground-truth visual scene descriptors, human-authored explanations are preferred head-to-head over the best machine-authored ones (few-shot GPT-4) in more than 2/3 of cases. We release models, code, leaderboard, and corpus, which includes newly-gathered annotations describing the image\u2019s locations/entities, what\u2019s unusual in the scene, and an explanation of the joke.", + "authors": [ + "Jack Hessel", + "Ana Marasovic", + "Jena D. Hwang", + "Lillian Lee", + "Jeff Da", + "Rowan Zellers", + "Robert Mankoff", + "Yejin Choi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.41", + "point2d": [ + -58.664608001708984, + 49.17190170288086 + ], + "cluster": 34.0 + }, + { + "idx": 43, + "title": "Making More of Little Data: Improving Low-Resource Automatic Speech Recognition Using Data Augmentation", + "abstract": "The performance of automatic speech recognition (ASR) systems has advanced substantially in recent years, particularly for languages for which a large amount of transcribed speech is available. Unfortunately, for low-resource languages, such as minority languages, regional languages or dialects, ASR performance generally remains much lower. In this study, we investigate whether data augmentation techniques could help improve low-resource ASR performance, focusing on four typologically diverse minority languages or language variants (West Germanic: Gronings, West-Frisian; Malayo-Polynesian: Besemah, Nasal). 
For all four languages, we examine the use of self-training, where an ASR system trained with the available human-transcribed data is used to generate transcriptions, which are then combined with the original data to train a new ASR system. For Gronings, for which there was a pre-existing text-to-speech (TTS) system available, we also examined the use of TTS to generate ASR training data from text-only sources. We find that using a self-training approach consistently yields improved performance (a relative WER reduction up to 20.5% compared to using an ASR system trained on 24 minutes of manually transcribed speech). The performance gain from TTS augmentation for Gronings was even stronger (up to 25.5% relative reduction in WER compared to a system based on 24 minutes of manually transcribed speech). In sum, our results show the benefit of using self-training or (if possible) TTS-generated data as an efficient solution to overcome the limitations of data availability for resource-scarce languages in order to improve ASR performance.",
    "authors": [
      "Martijn Bartelds",
      "Nay San",
      "Bradley McDonnell",
      "Dan Jurafsky",
      "Martijn Wieling"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.42",
    "point2d": [
      -70.51957702636719,
      14.894057273864746
    ],
    "cluster": 37.0
  },
  {
    "idx": 44,
    "title": "CLCL: Non-compositional Expression Detection with Contrastive Learning and Curriculum Learning",
    "abstract": "Non-compositional expressions present a substantial challenge for natural language processing (NLP) systems, necessitating more intricate processing compared to general language tasks, even with large pre-trained language models. Their non-compositional nature and limited availability of data resources further compound the difficulties in accurately learning their representations. This paper addresses both of these challenges. By leveraging contrastive learning techniques to build improved representations, it tackles the non-compositionality challenge. Additionally, we propose a dynamic curriculum learning framework specifically designed to take advantage of the scarce available data for modeling non-compositionality. Our framework employs an easy-to-hard learning strategy, progressively optimizing the model\u2019s performance by effectively utilizing available training data. Moreover, we integrate contrastive learning into the curriculum learning approach to maximize its benefits. Experimental results demonstrate the gradual improvement in the model\u2019s performance on idiom usage recognition and metaphor detection tasks. Our evaluation encompasses six datasets, consistently affirming the effectiveness of the proposed framework. Our models are available at https://github.com/zhjjn/CLCL.git.",
    "authors": [
      "Jianing Zhou",
      "Ziheng Zeng",
      "Suma Bhat"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.43",
    "point2d": [
      8.740968704223633,
      -55.24266052246094
    ],
    "cluster": 9.0
  },
  {
    "idx": 45,
    "title": "Multi-VALUE: A Framework for Cross-Dialectal English NLP",
    "abstract": "Dialect differences caused by regional, social, and economic factors cause performance discrepancies for many groups of language technology users. Inclusive and equitable language technology must critically be dialect invariant, meaning that performance remains constant over dialectal shifts.
Current systems often fall short of this ideal since they are designed and tested on a single dialect: Standard American English (SAE). We introduce a suite of resources for evaluating and achieving English dialect invariance. The resource is called Multi-VALUE, a controllable rule-based translation system spanning 50 English dialects and 189 unique linguistic features. Multi-VALUE maps SAE to synthetic forms of each dialect. First, we use this system to stress test question answering, machine translation, and semantic parsing. Stress tests reveal significant performance disparities for leading models on non-standard dialects. Second, we use this system as a data augmentation technique to improve the dialect robustness of existing systems. Finally, we partner with native speakers of Chicano and Indian English to release new gold-standard variants of the popular CoQA task. To execute the transformation code, run model checkpoints, and download both synthetic and gold-standard dialectal benchmark datasets, see http://value-nlp.org.",
    "authors": [
      "Caleb Ziems",
      "William Held",
      "Jingfeng Yang",
      "Jwala Dhamala",
      "Rahul Gupta",
      "Diyi Yang"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.44",
    "point2d": [
      -69.31973266601562,
      -2.7080774307250977
    ],
    "cluster": 46.0
  },
  {
    "idx": 46,
    "title": "Self-Edit: Fault-Aware Code Editor for Code Generation",
    "abstract": "Large language models (LLMs) have demonstrated an impressive ability to generate code on competitive programming tasks. However, with limited sample numbers, LLMs still suffer from poor accuracy. Inspired by the process of human programming, we propose a generate-and-edit approach named Self-Edit that utilizes execution results of the generated code from LLMs to improve the code quality on the competitive programming task. We execute the generated code on the example test case provided in the question and wrap execution results into a supplementary comment. Utilizing this comment as guidance, our fault-aware code editor is employed to correct errors in the generated code. We perform extensive evaluations across two competitive programming datasets with nine different LLMs. Compared to directly generating from LLMs, our approach can improve the average of pass@1 by 89% on APPS-dev, 31% on APPS-test, and 48% on HumanEval over nine popular code generation LLMs with parameter sizes ranging from 110M to 175B. Compared to other post-processing methods, our method demonstrates superior accuracy and efficiency.",
    "authors": [
      "Kechi Zhang",
      "Zhuo Li",
      "Jia Li",
      "Ge Li",
      "Zhi Jin"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.45",
    "point2d": [
      -8.890231132507324,
      -54.054054260253906
    ],
    "cluster": 11.0
  },
  {
    "idx": 47,
    "title": "ColD Fusion: Collaborative Descent for Distributed Multitask Finetuning",
    "abstract": "Pretraining has been shown to scale well with compute, data size and data diversity. Multitask learning trains on a mixture of supervised datasets and produces improved performance compared to self-supervised pretraining. Until now, massively multitask learning required simultaneous access to all datasets in the mixture and heavy compute resources that are only available to well-resourced teams.
In this paper, we propose ColD Fusion, a method that provides the benefits of multitask learning but leverages distributed computation and requires limited communication and no sharing of data. Consequentially, ColD Fusion can create a synergistic loop, where finetuned models can be recycled to continually improve the pretrained model they are based on. We show that ColD Fusion yields comparable benefits to multitask training by producing a model that (a) attains strong performance on all of the datasets it was multitask trained on and (b) is a better starting point for finetuning on unseen datasets. We find ColD Fusion outperforms RoBERTa and even previous multitask models. Specifically, when training and testing on 35 diverse datasets, the ColD Fusion-based model outperforms RoBERTa by 2.19 points on average without any changes to the architecture.",
    "authors": [
      "Shachar Don-Yehiya",
      "Elad Venezian",
      "Colin Raffel",
      "Noam Slonim",
      "Leshem Choshen"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.46",
    "point2d": [
      -35.400238037109375,
      -12.550765037536621
    ],
    "cluster": 44.0
  },
  {
    "idx": 48,
    "title": "Test-time Adaptation for Machine Translation Evaluation by Uncertainty Minimization",
    "abstract": "The neural metrics recently received considerable attention from the research community in the automatic evaluation of machine translation. Unlike text-based metrics that have interpretable and consistent evaluation mechanisms for various data sources, the reliability of neural metrics in assessing out-of-distribution data remains a concern due to the disparity between training data and real-world data. This paper aims to address the inference bias of neural metrics through uncertainty minimization during test time, without requiring additional data. Our proposed method comprises three steps: uncertainty estimation, test-time adaptation, and inference. Specifically, the model employs the prediction uncertainty of the current data as a signal to update a small fraction of parameters during test time and subsequently refine the prediction through optimization. To validate our approach, we apply the proposed method to three representative models and conduct experiments on the WMT21 benchmarks. The results obtained from both in-domain and out-of-distribution evaluations consistently demonstrate improvements in correlation performance across different models. Furthermore, we provide evidence that the proposed method effectively reduces model uncertainty. The code is publicly available at https://github.com/NLP2CT/TaU.",
    "authors": [
      "Runzhe Zhan",
      "Xuebo Liu",
      "Derek F. Wong",
      "Cuilian Zhang",
      "Lidia S. Chao",
      "Min Zhang"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.47",
    "point2d": [
      -75.00631713867188,
      -5.343696594238281
    ],
    "cluster": 1.0
  },
  {
    "idx": 49,
    "title": "Multi-CLS BERT: An Efficient Alternative to Traditional Ensembling",
    "abstract": "Ensembling BERT models often significantly improves accuracy, but at the cost of significantly more computation and memory footprint. In this work, we propose Multi-CLS BERT, a novel ensembling method for CLS-based prediction tasks that is almost as efficient as a single BERT model. Multi-CLS BERT uses multiple CLS tokens with a parameterization and objective that encourages their diversity.
Thus instead of fine-tuning each BERT model in an ensemble (and running them all at test time), we need only fine-tune our single Multi-CLS BERT model (and run the one model at test time, ensembling just the multiple final CLS embeddings). To test its effectiveness, we build Multi-CLS BERT on top of a state-of-the-art pretraining method for BERT (Aroca-Ouellette and Rudzicz, 2020). In experiments on GLUE and SuperGLUE we show that our Multi-CLS BERT reliably improves both overall accuracy and confidence estimation. When only 100 training samples are available in GLUE, the Multi-CLS BERT_Base model can even outperform the corresponding BERT_Large model. We analyze the behavior of our Multi-CLS BERT, showing that it has many of the same characteristics and behavior as a typical BERT 5-way ensemble, but with nearly 4-times less computation and memory.", + "authors": [ + "Haw-Shiuan Chang", + "Ruei-Yao Sun", + "Kathryn Ricci", + "Andrew McCallum" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.48", + "point2d": [ + -33.68107223510742, + -7.33212947845459 + ], + "cluster": 44.0 + }, + { + "idx": 50, + "title": "On-the-fly Cross-lingual Masking for Multilingual Pre-training", + "abstract": "In multilingual pre-training with the objective of MLM (masked language modeling) on multiple monolingual corpora, multilingual models only learn cross-linguality implicitly from isomorphic spaces formed by overlapping different language spaces due to the lack of explicit cross-lingual forward pass. In this work, we present CLPM (Cross-lingual Prototype Masking), a dynamic and token-wise masking scheme, for multilingual pre-training, using a special token [\\mathcal{C}]_{x} to replace a random token x in the input sentence. [\\mathcal{C}]_{x} is a cross-lingual prototype for x and then forms an explicit cross-lingual forward pass. We instantiate CLPM for the multilingual pre-training phase of UNMT (unsupervised neural machine translation), and experiments show that CLPM can consistently improve the performance of UNMT models on \\{De, Ro, Ne \\} \\leftrightarrow En. Beyond UNMT or bilingual tasks, we show that CLPM can consistently improve the performance of multilingual models on cross-lingual classification.", + "authors": [ + "Xi Ai", + "Bin Fang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.49", + "point2d": [ + -60.356868743896484, + -13.430316925048828 + ], + "cluster": 21.0 + }, + { + "idx": 51, + "title": "How About Kind of Generating Hedges using End-to-End Neural Models?", + "abstract": "Hedging is a strategy for softening the impact of a statement in conversation. In reducing the strength of an expression, it may help to avoid embarrassment (more technically, \u201cface threat\u201d) to one\u2019s listener. For this reason, it is often found in contexts of instruction, such as tutoring. In this work, we develop a model of hedge generation based on i) fine-tuning state-of-the-art language models trained on human-human tutoring data, followed by ii) reranking to select the candidate that best matches the expected hedging strategy within a candidate pool using a hedge classifier. We apply this method to a natural peer-tutoring corpus containing a significant number of disfluencies, repetitions, and repairs. The results show that generation in this noisy environment is feasible with reranking. 
By conducting an error analysis for both approaches, we reveal the challenges faced by systems attempting to accomplish both social and task-oriented goals in conversation.", + "authors": [ + "Alafate Abulimiti", + "Chlo\u00e9 Clavel", + "Justine Cassell" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.50", + "point2d": [ + 5.621829986572266, + 54.81218719482422 + ], + "cluster": 2.0 + }, + { + "idx": 52, + "title": "DiffusionDB: A Large-scale Prompt Gallery Dataset for Text-to-Image Generative Models", + "abstract": "With recent advancements in diffusion models, users can generate high-quality images by writing text prompts in natural language. However, generating images with desired details requires proper prompts, and it is often unclear how a model reacts to different prompts or what the best prompts are. To help researchers tackle these critical challenges, we introduce DiffusionDB, the first large-scale text-to-image prompt dataset totaling 6.5TB, containing 14 million images generated by Stable Diffusion, 1.8 million unique prompts, and hyperparameters specified by real users. We analyze the syntactic and semantic characteristics of prompts. We pinpoint specific hyperparameter values and prompt styles that can lead to model errors and present evidence of potentially harmful model usage, such as the generation of misinformation. The unprecedented scale and diversity of this human-actuated dataset provide exciting research opportunities in understanding the interplay between prompts and generative models, detecting deepfakes, and designing human-AI interaction tools to help users more easily use these models. DiffusionDB is publicly available at: https://poloclub.github.io/diffusiondb.", + "authors": [ + "Zijie J. Wang", + "Evan Montoya", + "David Munechika", + "Haoyang Yang", + "Benjamin Hoover", + "Duen Horng Chau" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.51", + "point2d": [ + -62.44669723510742, + 46.99876022338867 + ], + "cluster": 43.0 + }, + { + "idx": 53, + "title": "From Key Points to Key Point Hierarchy: Structured and Expressive Opinion Summarization", + "abstract": "Key Point Analysis (KPA) has been recently proposed for deriving fine-grained insights from collections of textual comments. KPA extracts the main points in the data as a list of concise sentences or phrases, termed Key Points, and quantifies their prevalence. While key points are more expressive than word clouds and key phrases, making sense of a long, flat list of key points, which often express related ideas in varying levels of granularity, may still be challenging. To address this limitation of KPA, we introduce the task of organizing a given set of key points into a hierarchy, according to their specificity. Such hierarchies may be viewed as a novel type of Textual Entailment Graph. We develop ThinkP, a high quality benchmark dataset of key point hierarchies for business and product reviews, obtained by consolidating multiple annotations. We compare different methods for predicting pairwise relations between key points, and for inferring a hierarchy from these pairwise predictions. 
In particular, for the task of computing pairwise key point relations, we achieve significant gains over existing strong baselines by applying directional distributional similarity methods to a novel distributional representation of key points, and further boost performance via weak supervision.", + "authors": [ + "Arie Cattan", + "Lilach Eden", + "Yoav Kantor", + "Roy Bar-Haim" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.52", + "point2d": [ + -1.9435086250305176, + 39.19080352783203 + ], + "cluster": 7.0 + }, + { + "idx": 54, + "title": "When to Use What: An In-Depth Comparative Empirical Analysis of OpenIE Systems for Downstream Applications", + "abstract": "Open Information Extraction (OpenIE) has been used in the pipelines of various NLP tasks. Unfortunately, there is no clear consensus on which models to use in which tasks. Muddying things further is the lack of comparisons that take differing training sets into account. In this paper, we present an application-focused empirical survey of neural OpenIE models, training sets, and benchmarks in an effort to help users choose the most suitable OpenIE systems for their applications. We find that the different assumptions made by different models and datasets have a statistically significant effect on performance, making it important to choose the most appropriate model for one\u2019s applications. We demonstrate the applicability of our recommendations on a downstream Complex QA application.", + "authors": [ + "Kevin Pei", + "Ishan Jindal", + "Kevin Chen-Chuan Chang", + "ChengXiang Zhai", + "Yunyao Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.53", + "point2d": [ + 17.95454216003418, + 8.055238723754883 + ], + "cluster": 40.0 + }, + { + "idx": 55, + "title": "Subjective Crowd Disagreements for Subjective Data: Uncovering Meaningful CrowdOpinion with Population-level Learning", + "abstract": "Human-annotated data plays a critical role in the fairness of AI systems, including those that deal with life-altering decisions or moderating human-created web/social media content. Conventionally, annotator disagreements are resolved before any learning takes place. However, researchers are increasingly identifying annotator disagreement as pervasive and meaningful. They also question the performance of a system when annotators disagree. Particularly when minority views are disregarded, especially among groups that may already be underrepresented in the annotator population. In this paper, we introduce CrowdOpinion, an unsupervised learning based approach that uses language features and label distributions to pool similar items into larger samples of label distributions. We experiment with four generative and one density-based clustering method, applied to five linear combinations of label distributions and features. We use five publicly available benchmark datasets (with varying levels of annotator disagreements) from social media (Twitter, Gab, and Reddit). We also experiment in the wild using a dataset from Facebook, where annotations come from the platform itself by users reacting to posts. 
We evaluate CrowdOpinion as a label distribution prediction task using KL-divergence and a single-label problem using accuracy measures.",
    "authors": [
      "Tharindu Cyril Weerasooriya",
      "Sarah Luger",
      "Saloni Poddar",
      "Ashiqur KhudaBukhsh",
      "Christopher Homan"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.54",
    "point2d": [
      22.975086212158203,
      25.47743034362793
    ],
    "cluster": 10.0
  },
  {
    "idx": 56,
    "title": "Post-Abstention: Towards Reliably Re-Attempting the Abstained Instances in QA",
    "abstract": "Despite remarkable progress made in natural language processing, even the state-of-the-art models often make incorrect predictions. Such predictions hamper the reliability of systems and limit their widespread adoption in real-world applications. \u2018Selective prediction\u2019 partly addresses the above concern by enabling models to abstain from answering when their predictions are likely to be incorrect. While selective prediction is advantageous, it leaves us with a pertinent question \u2018what to do after abstention\u2019. To this end, we present an explorative study on \u2018Post-Abstention\u2019, a task that allows re-attempting the abstained instances with the aim of increasing **coverage** of the system without significantly sacrificing its **accuracy**. We first provide a mathematical formulation of this task and then explore several methods to solve it. Comprehensive experiments on 11 QA datasets show that these methods lead to considerable risk improvements \u2013the performance metric of the Post-Abstention task\u2013 both in the in-domain and the out-of-domain settings. We also conduct a thorough analysis of these results which further leads to several interesting findings. Finally, we believe that our work will encourage and facilitate further research in this important area of addressing the reliability of NLP systems.",
    "authors": [
      "Neeraj Varshney",
      "Chitta Baral"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.55",
    "point2d": [
      62.22390365600586,
      6.400487899780273
    ],
    "cluster": 5.0
  },
  {
    "idx": 57,
    "title": "UniLG: A Unified Structure-aware Framework for Lyrics Generation",
    "abstract": "As a special task of natural language generation, conditional lyrics generation needs to consider the structure of generated lyrics and the relationship between lyrics and music. Due to various forms of conditions, a lyrics generation system is expected to generate lyrics conditioned on different signals, such as music scores, music audio, or partially-finished lyrics, etc. However, most of the previous works have ignored the musical attributes hidden behind the lyrics and the structure of the lyrics. Additionally, most works only handle limited lyrics generation conditions, such as lyrics generation based on music score or partial lyrics, and they cannot be easily extended to other generation conditions with the same framework. In this paper, we propose a unified structure-aware lyrics generation framework named UniLG. Specifically, we design compound templates that incorporate textual and musical information to improve structure modeling and unify the different lyrics generation conditions. Extensive experiments demonstrate the effectiveness of our framework.
Both objective and subjective evaluations show significant improvements in generating structural lyrics.",
    "authors": [
      "Tao Qian",
      "Fan Lou",
      "Jiatong Shi",
      "Yuning Wu",
      "Shuai Guo",
      "Xiang Yin",
      "Qin Jin"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.56",
    "point2d": [
      -33.22892379760742,
      43.306034088134766
    ],
    "cluster": 35.0
  },
  {
    "idx": 58,
    "title": "FC-KBQA: A Fine-to-Coarse Composition Framework for Knowledge Base Question Answering",
    "abstract": "The generalization problem on KBQA has drawn considerable attention. Existing research suffers from the generalization issue brought by the entanglement in the coarse-grained modeling of the logical expression, or inexecutability issues due to the fine-grained modeling of disconnected classes and relations in real KBs. We propose a Fine-to-Coarse Composition framework for KBQA (FC-KBQA) to both ensure the generalization ability and executability of the logical expression. The main idea of FC-KBQA is to extract relevant fine-grained knowledge components from KB and reformulate them into middle-grained knowledge pairs for generating the final logical expressions. FC-KBQA derives new state-of-the-art performance on GrailQA and WebQSP, and runs 4 times faster than the baseline. Our code is now available at GitHub: https://github.com/RUCKBReasoning/FC-KBQA.",
    "authors": [
      "Lingxi Zhang",
      "Jing Zhang",
      "Yanling Wang",
      "Shulin Cao",
      "Xinmei Huang",
      "Cuiping Li",
      "Hong Chen",
      "Juanzi Li"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.57",
    "point2d": [
      69.19601440429688,
      1.4201706647872925
    ],
    "cluster": 5.0
  },
  {
    "idx": 59,
    "title": "Does GPT-3 Grasp Metaphors? Identifying Metaphor Mappings with Generative Language Models",
    "abstract": "Conceptual metaphors present a powerful cognitive vehicle to transfer knowledge structures from a source to a target domain. Prior neural approaches focus on detecting whether natural language sequences are metaphoric or literal. We believe that to truly probe metaphoric knowledge in pre-trained language models, their capability to detect this transfer should be investigated. To this end, this paper proposes to probe the ability of GPT-3 to detect metaphoric language and predict the metaphor\u2019s source domain without any pre-set domains. We experiment with different training sample configurations for fine-tuning and few-shot prompting on two distinct datasets. When provided 12 few-shot samples in the prompt, GPT-3 generates the correct source domain for a new sample with an accuracy of 65.15% in English and 34.65% in Spanish. GPT\u2019s most common error is a hallucinated source domain for which no indicator is present in the sentence.
Other common errors include identifying a sequence as literal even though a metaphor is present and predicting the wrong source domain based on specific words in the sequence that are not metaphorically related to the target domain.",
    "authors": [
      "Lennart Wachowiak",
      "Dagmar Gromann"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.58",
    "point2d": [
      8.21931266784668,
      -56.286163330078125
    ],
    "cluster": 9.0
  },
  {
    "idx": 60,
    "title": "Being Right for Whose Right Reasons?",
    "abstract": "Explainability methods are used to benchmark the extent to which model predictions align with human rationales, i.e., are \u2018right for the right reasons\u2019. Previous work has failed to acknowledge, however, that what counts as a rationale is sometimes subjective. This paper presents what we think is a first of its kind, a collection of human rationale annotations augmented with the annotators\u2019 demographic information. We cover three datasets spanning sentiment analysis and common-sense reasoning, and six demographic groups (balanced across age and ethnicity). Such data enables us to ask both what demographics our predictions align with and whose reasoning patterns our models\u2019 rationales align with. We find systematic inter-group annotator disagreement and show how 16 Transformer-based models align better with rationales provided by certain demographic groups: We find that models are biased towards aligning best with older and/or white annotators. We zoom in on the effects of model size and model distillation, finding \u2013contrary to our expectations\u2013 negative correlations between model size and rationale agreement as well as no evidence that either model size or model distillation improves fairness.",
    "authors": [
      "Terne Sasha Thorn Jakobsen",
      "Laura Cabello",
      "Anders S\u00f8gaard"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.59",
    "point2d": [
      31.908809661865234,
      -6.019438743591309
    ],
    "cluster": 10.0
  },
  {
    "idx": 61,
    "title": "ALERT: Adapt Language Models to Reasoning Tasks",
    "abstract": "Recent advancements in large language models have enabled them to perform well on complex tasks that require step-by-step reasoning with few-shot learning. However, it is unclear whether these models are applying reasoning skills they have learnt during pre-training, or if they are simply memorizing their training corpus at finer granularity and have learnt to better understand their context. To address this question, we introduce ALERT, a benchmark and suite of analyses for evaluating reasoning skills of language models. ALERT enables comparing pre-trained and finetuned models on complex tasks that require reasoning skills to solve. Our benchmark provides a test bed to assess any language model on fine-grained reasoning skills, which spans over 20 datasets and covers 10 different reasoning skills. By using ALERT we further investigate the role of finetuning. Our extensive empirical analysis shows that language models learn more reasoning skills such as textual entailment, abductive reasoning, and analogical reasoning during the finetuning stage compared to the pretraining stage.
However, we also find that when language models are finetuned they tend to overfit to the prompt template, which hurts model robustness and causes generalization problems.",
    "authors": [
      "Ping Yu",
      "Tianlu Wang",
      "Olga Golovneva",
      "Badr AlKhamissi",
      "Siddharth Verma",
      "Zhijing Jin",
      "Gargi Ghosh",
      "Mona Diab",
      "Asli Celikyilmaz"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.60",
    "point2d": [
      43.117549896240234,
      -14.620807647705078
    ],
    "cluster": 36.0
  },
  {
    "idx": 62,
    "title": "Glot500: Scaling Multilingual Corpora and Language Models to 500 Languages",
    "abstract": "The NLP community has mainly focused on scaling Large Language Models (LLMs) vertically, i.e., making them better for about 100 languages. We instead scale LLMs horizontally: we create, through continued pretraining, Glot500-m, an LLM that covers 511 predominantly low-resource languages. An important part of this effort is to collect and clean Glot500-c, a corpus that covers these 511 languages and allows us to train Glot500-m. We evaluate Glot500-m on five diverse tasks across these languages. We observe large improvements for both high-resource and low-resource languages compared to an XLM-R baseline. Our analysis shows that no single factor explains the quality of multilingual LLM representations. Rather, a combination of factors determines quality including corpus size, script, \u201chelp\u201d from related languages and the total capacity of the model. Our work addresses an important goal of NLP research: we should not limit NLP to a small fraction of the world\u2019s languages and instead strive to support as many languages as possible to bring the benefits of NLP technology to all languages and cultures. Code, data and models are available at https://github.com/cisnlp/Glot500.",
    "authors": [
      "Ayyoob ImaniGooghari",
      "Peiqin Lin",
      "Amir Hossein Kargaran",
      "Silvia Severini",
      "Masoud Jalili Sabet",
      "Nora Kassner",
      "Chunlan Ma",
      "Helmut Schmid",
      "Andr\u00e9 Martins",
      "Fran\u00e7ois Yvon",
      "Hinrich Sch\u00fctze"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.61",
    "point2d": [
      -30.412960052490234,
      -31.726783752441406
    ],
    "cluster": 46.0
  },
  {
    "idx": 63,
    "title": "Joint Constrained Learning with Boundary-adjusting for Emotion-Cause Pair Extraction",
    "abstract": "Emotion-Cause Pair Extraction (ECPE) aims to identify the document\u2019s emotion clauses and corresponding cause clauses. Like other relation extraction tasks, ECPE is closely associated with the relationship between sentences. Recent methods based on Graph Convolutional Networks focus on how to model the multiplex relations between clauses by constructing different edges. However, the data of emotions, causes, and pairs are extremely unbalanced, and current methods get their representation using the same graph structure. In this paper, we propose a **J**oint **C**onstrained Learning framework with **B**oundary-adjusting for Emotion-Cause Pair Extraction (**JCB**). Specifically, through constrained learning, we summarize the prior rules existing in the data and force the model to take them into consideration in optimization, which helps the model learn a better representation from unbalanced data. Furthermore, we adjust the decision boundary of classifiers according to the relations between subtasks, which have always been ignored.
No longer working independently as in the previous framework, the classifiers corresponding to three subtasks cooperate under the relation constraints. Experimental results show that **JCB** obtains competitive results compared with state-of-the-art methods and prove its robustness on unbalanced data.", + "authors": [ + "Huawen Feng", + "Junlong Liu", + "Junhao Zheng", + "Haibin Chen", + "Xichen Shang", + "Qianli Ma" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.62", + "point2d": [ + 46.1104736328125, + -53.98152160644531 + ], + "cluster": 13.0 + }, + { + "idx": 64, + "title": "Pretrained Bidirectional Distillation for Machine Translation", + "abstract": "Knowledge transfer can boost neural machine translation (NMT), for example, by finetuning a pretrained masked language model (LM). However, it may suffer from the forgetting problem and the structural inconsistency between pretrained LMs and NMT models. Knowledge distillation (KD) may be a potential solution to alleviate these issues, but few studies have investigated language knowledge transfer from pretrained language models to NMT models through KD. In this paper, we propose Pretrained Bidirectional Distillation (PBD) for NMT, which aims to efficiently transfer bidirectional language knowledge from masked language pretraining to NMT models. Its advantages are reflected in efficiency and effectiveness through a globally defined and bidirectional context-aware distillation objective. Bidirectional language knowledge of the entire sequence is transferred to an NMT model concurrently during translation training. Specifically, we propose self-distilled masked language pretraining to obtain the PBD objective. We also design PBD losses to efficiently distill the language knowledge, in the form of token probabilities, to the encoder and decoder of an NMT model using the PBD objective. Extensive experiments reveal that pretrained bidirectional distillation can significantly improve machine translation performance and achieve competitive or even better results than previous pretrain-finetune or unified multilingual translation methods in supervised, unsupervised, and zero-shot scenarios. Empirically, it is concluded that pretrained bidirectional distillation is an effective and efficient method for transferring language knowledge from pretrained language models to NMT models.", + "authors": [ + "Yimeng Zhuang", + "Mei Tu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.63", + "point2d": [ + -62.47697448730469, + -12.803709030151367 + ], + "cluster": 21.0 + }, + { + "idx": 65, + "title": "Pivotal Role of Language Modeling in Recommender Systems: Enriching Task-specific and Task-agnostic Representation Learning", + "abstract": "Recent studies have proposed unified user modeling frameworks that leverage user behavior data from various applications. Many of them benefit from utilizing users\u2019 behavior sequences as plain texts, representing rich information in any domain or system without losing generality. Hence, a question arises: Can language modeling for user history corpus help improve recommender systems? While its versatile usability has been widely investigated in many domains, its applications to recommender systems still remain underexplored. We show that language modeling applied directly to task-specific user histories achieves excellent results on diverse recommendation tasks. 
Also, leveraging additional task-agnostic user histories delivers significant performance benefits. We further demonstrate that our approach can provide promising transfer learning capabilities for a broad spectrum of real-world recommender systems, even on unseen domains and services.", + "authors": [ + "Kyuyong Shin", + "Hanock Kwak", + "Wonjae Kim", + "Jisu Jeong", + "Seungjae Jung", + "Kyungmin Kim", + "Jung-Woo Ha", + "Sang-Woo Lee" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.64", + "point2d": [ + 3.0410351753234863, + 34.22880172729492 + ], + "cluster": 18.0 + }, + { + "idx": 66, + "title": "Improving Continual Relation Extraction by Distinguishing Analogous Semantics", + "abstract": "Continual relation extraction (RE) aims to learn constantly emerging relations while avoiding forgetting the learned relations. Existing works store a small number of typical samples to re-train the model for alleviating forgetting. However, repeatedly replaying these samples may cause the overfitting problem. We conduct an empirical study on existing works and observe that their performance is severely affected by analogous relations. To address this issue, we propose a novel continual extraction model for analogous relations. Specifically, we design memory-insensitive relation prototypes and memory augmentation to overcome the overfitting problem. We also introduce integrated training and focal knowledge distillation to enhance the performance on analogous relations. Experimental results show the superiority of our model and demonstrate its effectiveness in distinguishing analogous relations and overcoming overfitting.", + "authors": [ + "Wenzheng Zhao", + "Yuanning Cui", + "Wei Hu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.65", + "point2d": [ + 39.31782150268555, + -62.1197509765625 + ], + "cluster": 25.0 + }, + { + "idx": 67, + "title": "Improving Pretraining Techniques for Code-Switched NLP", + "abstract": "Pretrained models are a mainstay in modern NLP applications. Pretraining requires access to large volumes of unlabeled text. While monolingual text is readily available for many of the world\u2019s languages, access to large quantities of code-switched text (i.e., text with tokens of multiple languages interspersed within a sentence) is much more scarce. Given this resource constraint, the question of how pretraining using limited amounts of code-switched text could be altered to improve performance for code-switched NLP becomes important to tackle. In this paper, we explore different masked language modeling (MLM) pretraining techniques for code-switched text that are cognizant of language boundaries prior to masking. The language identity of the tokens can either come from human annotators, trained language classifiers, or simple relative frequency-based estimates. We also present an MLM variant by introducing a residual connection from an earlier layer in the pretrained model that uniformly boosts performance on downstream tasks. Experiments on two downstream tasks, Question Answering (QA) and Sentiment Analysis (SA), involving four code-switched language pairs (Hindi-English, Spanish-English, Tamil-English, Malayalam-English) yield relative improvements of up to 5.8 and 2.7 F1 scores on QA (Hindi-English) and SA (Tamil-English), respectively, compared to standard pretraining techniques. 
To understand our task improvements better, we use a series of probes to study what additional information is encoded by our pretraining techniques and also introduce an auxiliary loss function that explicitly models language identification to further aid the residual MLM variants.",
    "authors": [
      "Richeek Das",
      "Sahasra Ranjan",
      "Shreya Pathak",
      "Preethi Jyothi"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.66",
    "point2d": [
      -28.95199966430664,
      -29.176128387451172
    ],
    "cluster": 20.0
  },
  {
    "idx": 68,
    "title": "A Theory of Unsupervised Speech Recognition",
    "abstract": "Unsupervised speech recognition (ASR-U) is the problem of learning automatic speech recognition (ASR) systems from unpaired speech-only and text-only corpora. While various algorithms exist to solve this problem, a theoretical framework is missing to study their properties and address such issues as sensitivity to hyperparameters and training instability. In this paper, we propose a general theoretical framework to study the properties of ASR-U systems based on random matrix theory and the theory of neural tangent kernels. Such a framework allows us to prove various learnability conditions and sample complexity bounds of ASR-U. Extensive ASR-U experiments on synthetic languages with three classes of transition graphs provide strong empirical evidence for our theory (code available at https://github.com/cactuswiththoughts/UnsupASRTheory.git).",
    "authors": [
      "Liming Wang",
      "Mark Hasegawa-Johnson",
      "Chang Yoo"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.67",
    "point2d": [
      -65.56981658935547,
      15.007123947143555
    ],
    "cluster": 30.0
  },
  {
    "idx": 69,
    "title": "ThinkSum: Probabilistic reasoning over sets using large language models",
    "abstract": "Large language models (LLMs) have a substantial capacity for high-level analogical reasoning: reproducing patterns in linear text that occur in their training data (zero-shot evaluation) or in the provided context (few-shot in-context learning). However, recent studies show that even the more advanced LLMs fail in scenarios that require reasoning over multiple objects or facts and making sequences of logical deductions. We propose a two-stage probabilistic inference paradigm, ThinkSum, which reasons over sets of objects or facts in a structured manner. In the first stage (Think \u2013 retrieval of associations), a LLM is queried in parallel over a set of phrases extracted from the prompt or an auxiliary model call. In the second stage (Sum \u2013 probabilistic inference or reasoning), the results of these queries are aggregated to make the final prediction. We demonstrate the possibilities and advantages of ThinkSum on the BIG-bench suite of LLM evaluation tasks, achieving improvements over the state of the art using GPT-family models on thirteen difficult tasks, often with far smaller model variants. We also compare and contrast ThinkSum with other proposed modifications to direct prompting of LLMs, such as variants of chain-of-thought prompting.
Our results suggest that because the probabilistic inference in ThinkSum is performed outside of calls to the LLM, ThinkSum is less sensitive to prompt design, yields more interpretable predictions, and can be flexibly combined with latent variable models to extract structured knowledge from LLMs. Overall, our proposed paradigm represents a promising approach for enhancing the reasoning capabilities of LLMs.",
    "authors": [
      "Batu Ozturkler",
      "Nikolay Malkin",
      "Zhen Wang",
      "Nebojsa Jojic"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.68",
    "point2d": [
      45.870113372802734,
      -13.02156925201416
    ],
    "cluster": 36.0
  },
  {
    "idx": 70,
    "title": "NLG Evaluation Metrics Beyond Correlation Analysis: An Empirical Metric Preference Checklist",
    "abstract": "In this study, we analyze automatic evaluation metrics for Natural Language Generation (NLG), specifically task-agnostic metrics and human-aligned metrics. Task-agnostic metrics, such as Perplexity, BLEU, BERTScore, are cost-effective and highly adaptable to diverse NLG tasks, yet they have a weak correlation with humans. Human-aligned metrics (CTC, CtrlEval, UniEval) improve the correlation level by incorporating desirable human-like qualities as training objective. However, their effectiveness at discerning system-level performance and quality of system outputs remains unclear.We present a metric preference checklist as a framework to assess the effectiveness of automatic metrics in three NLG tasks: Text Summarization, Dialogue Response Generation, and Controlled Generation. Our proposed framework provides access: (i) for verifying whether automatic metrics are faithful to human preference, regardless of their correlation level to humans; and (ii) for inspecting the strengths and limitations of NLG systems via pairwise evaluation. We show that automatic metrics provide better guidance than humans on discriminating system-level performance in Text Summarization and Controlled Generation tasks. We also show that the multi-aspect human-aligned metric (UniEval) is not necessarily dominant over single-aspect human-aligned metrics (CTC, CtrlEval) and task-agnostic metrics (BLEU, BERTScore), particularly in Controlled Generation tasks.",
    "authors": [
      "Iftitahu Nimah",
      "Meng Fang",
      "Vlado Menkovski",
      "Mykola Pechenizkiy"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.69",
    "point2d": [
      -18.370561599731445,
      18.89144515991211
    ],
    "cluster": 47.0
  },
  {
    "idx": 71,
    "title": "DialoGPS: Dialogue Path Sampling in Continuous Semantic Space for Data Augmentation in Multi-Turn Conversations",
    "abstract": "In open-domain dialogue generation tasks, contexts and responses in most datasets are one-to-one mapped, violating an important many-to-many characteristic: a context leads to various responses, and a response answers multiple contexts. Without such patterns, models poorly generalize and prefer responding safely. Many attempts have been made in either multi-turn settings from a one-to-many perspective or in a many-to-many perspective but limited to single-turn settings. The major challenge in many-to-many augmentation of multi-turn dialogues is that discretely replacing each turn with semantic similarity breaks fragile context coherence. In this paper, we propose the DialoGue Path Sampling (DialoGPS) method in continuous semantic space, the first many-to-many augmentation method for multi-turn dialogues.
Specifically, we map a dialogue to our extended Brownian Bridge, a special Gaussian process. We sample latent variables to form coherent dialogue paths in the continuous space. A dialogue path corresponds to a new multi-turn dialogue and is used as augmented training data. We show the effect of DialoGPS with both automatic and human evaluation.", + "authors": [ + "Ang Lv", + "Jinpeng Li", + "Yuhan Chen", + "Gao Xing", + "Ji Zhang", + "Rui Yan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.70", + "point2d": [ + 10.02076530456543, + 67.39103698730469 + ], + "cluster": 49.0 + }, + { + "idx": 72, + "title": "TECHS: Temporal Logical Graph Networks for Explainable Extrapolation Reasoning", + "abstract": "Extrapolation reasoning on temporal knowledge graphs (TKGs) aims to forecast future facts based on past counterparts. There are two main challenges: (1) incorporating the complex information, including structural dependencies, temporal dynamics, and hidden logical rules; (2) implementing differentiable logical rule learning and reasoning for explainability. To this end, we propose an explainable extrapolation reasoning framework TEmporal logiCal grapH networkS (TECHS), which mainly contains a temporal graph encoder and a logical decoder. The former employs a graph convolutional network with temporal encoding and heterogeneous attention to embed topological structures and temporal dynamics. The latter integrates propositional reasoning and first-order reasoning by introducing a reasoning graph that iteratively expands to find the answer. A forward message-passing mechanism is also proposed to update node representations and their propositional and first-order attention scores. Experimental results demonstrate that it outperforms state-of-the-art baselines.", + "authors": [ + "Qika Lin", + "Jun Liu", + "Rui Mao", + "Fangzhi Xu", + "Erik Cambria" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.71", + "point2d": [ + 58.72733688354492, + -60.38014221191406 + ], + "cluster": 22.0 + }, + { + "idx": 73, + "title": "Consistency Regularization Training for Compositional Generalization", + "abstract": "Existing neural models have difficulty generalizing to unseen combinations of seen components. To achieve compositional generalization, models are required to consistently interpret (sub)expressions across contexts. Without modifying model architectures, we improve the capability of the Transformer for compositional generalization through consistency regularization training, which promotes representation consistency across samples and prediction consistency for a single sample. Experimental results on semantic parsing and machine translation benchmarks empirically demonstrate the effectiveness and generality of our method.
In addition, we find that the prediction consistency scores on in-distribution validation sets can be an alternative for evaluating models during training, when commonly-used metrics are not informative.", + "authors": [ + "Yongjing Yin", + "Jiali Zeng", + "Yafu Li", + "Fandong Meng", + "Jie Zhou", + "Yue Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.72", + "point2d": [ + -26.828031539916992, + -54.05757141113281 + ], + "cluster": 41.0 + }, + { + "idx": 74, + "title": "NUWA-XL: Diffusion over Diffusion for eXtremely Long Video Generation", + "abstract": "In this paper, we propose NUWA-XL, a novel Diffusion over Diffusion architecture for eXtremely Long video generation. Most current work generates long videos segment by segment sequentially, which normally leads to the gap between training on short videos and inferring long videos, and the sequential generation is inefficient. Instead, our approach adopts a \u201ccoarse-to-fine\u201d process, in which the video can be generated in parallel at the same granularity. A global diffusion model is applied to generate the keyframes across the entire time range, and then local diffusion models recursively fill in the content between nearby frames. This simple yet effective strategy allows us to directly train on long videos (3376 frames) to reduce the training-inference gap and makes it possible to generate all segments in parallel. To evaluate our model, we build FlintstonesHD dataset, a new benchmark for long video generation. Experiments show that our model not only generates high-quality long videos with both global and local coherence, but also decreases the average inference time from 7.55min to 26s (by 94.26%) at the same hardware setting when generating 1024 frames. The homepage link is [NUWA-XL](https://msra-nuwa.azurewebsites.net)", + "authors": [ + "Shengming Yin", + "Chenfei Wu", + "Huan Yang", + "Jianfeng Wang", + "Xiaodong Wang", + "Minheng Ni", + "Zhengyuan Yang", + "Linjie Li", + "Shuguang Liu", + "Fan Yang", + "Jianlong Fu", + "Ming Gong", + "Lijuan Wang", + "Zicheng Liu", + "Houqiang Li", + "Nan Duan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.73", + "point2d": [ + -64.9515380859375, + 53.199546813964844 + ], + "cluster": 43.0 + }, + { + "idx": 75, + "title": "Synthetic Text Generation with Differential Privacy: A Simple and Practical Recipe", + "abstract": "Privacy concerns have attracted increasing attention in data-driven products due to the tendency of machine learning models to memorize sensitive training data. Generating synthetic versions of such data with a formal privacy guarantee, such as differential privacy (DP), provides a promising path to mitigating these privacy concerns, but previous approaches in this direction have typically failed to produce synthetic data of high quality. In this work, we show that a simple and practical recipe in the text domain is effective: simply fine-tuning a pretrained generative language model with DP enables the model to generate useful synthetic text with strong privacy protection. 
Through extensive empirical analyses on both benchmark and private customer data, we demonstrate that our method produces synthetic text that is competitive in terms of utility with its non-private counterpart, while providing strong protection against potential privacy leakage.", + "authors": [ + "Xiang Yue", + "Huseyin Inan", + "Xuechen Li", + "Girish Kumar", + "Julia McAnallen", + "Hoda Shajari", + "Huan Sun", + "David Levitan", + "Robert Sim" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.74", + "point2d": [ + -0.9142873287200928, + 14.812359809875488 + ], + "cluster": 15.0 + }, + { + "idx": 76, + "title": "A Close Look into the Calibration of Pre-trained Language Models", + "abstract": "Pre-trained language models (PLMs) may fail to give reliable estimates of their predictive uncertainty. We take a close look into this problem, aiming to answer two questions: (1) Do PLMs learn to become calibrated in the training process? (2) How effective are existing calibration methods? For the first question, we conduct fine-grained control experiments to study the dynamic change in PLMs\u2019 calibration performance in training. We consider six factors as control variables, including dataset difficulty, available training samples, training steps, the number of tunable parameters, model scale, and pretraining. We observe a consistent change in calibration performance across the six factors. We find that PLMs don\u2019t learn to become calibrated in training, evidenced by the continual increase in confidence, no matter whether the predictions are correct or not. We highlight that our finding somewhat contradicts two established conclusions: (a) Larger PLMs are more calibrated; (b) Pretraining improves model calibration. Next, we study the effectiveness of existing calibration methods in mitigating the overconfidence issue. Besides unlearnable calibration methods (e.g., label smoothing), we adapt and extend two recently proposed learnable methods that directly collect data to train models to have reasonable confidence estimations. Experimental results show that learnable methods significantly reduce PLMs\u2019 confidence in wrong predictions.", + "authors": [ + "Yangyi Chen", + "Lifan Yuan", + "Ganqu Cui", + "Zhiyuan Liu", + "Heng Ji" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.75", + "point2d": [ + -30.11121940612793, + -15.533650398254395 + ], + "cluster": 8.0 + }, + { + "idx": 77, + "title": "DIONYSUS: A Pre-trained Model for Low-Resource Dialogue Summarization", + "abstract": "Dialogue summarization has recently garnered significant attention due to its wide range of applications. However, existing methods for summarizing dialogues have limitations because they do not take into account the inherent structure of dialogue and rely heavily on labeled data, which can lead to poor performance in new domains. In this work, we propose DIONYSUS (dynamic input optimization in pre-training for dialogue summarization), a pre-trained encoder-decoder model for summarizing dialogues in any new domain. To pre-train DIONYSUS, we create two pseudo summaries for each dialogue example: one from a fine-tuned summarization model and the other from important dialogue turns. We then choose one of these pseudo summaries based on information distribution differences in different types of dialogues.
This selected pseudo summary serves as the objective for pre-training DIONYSUS using a self-supervised approach on a large dialogue corpus. Our experiments show that DIONYSUS outperforms existing methods on six datasets, as demonstrated by its ROUGE scores in zero-shot and few-shot settings", + "authors": [ + "Yu Li", + "Baolin Peng", + "Pengcheng He", + "Michel Galley", + "Zhou Yu", + "Jianfeng Gao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.76", + "point2d": [ + -3.4325883388519287, + 53.46904754638672 + ], + "cluster": 49.0 + }, + { + "idx": 78, + "title": "MS-DETR: Natural Language Video Localization with Sampling Moment-Moment Interaction", + "abstract": "Given a text query, the task of Natural Language Video Localization (NLVL) is to localize a temporal moment in an untrimmed video that semantically matches the query. In this paper, we adopt a proposal-based solution that generates proposals (i.e. candidate moments) and then select the best matching proposal. On top of modeling the cross-modal interaction between candidate moments and the query, our proposed Moment Sampling DETR (MS-DETR) enables efficient moment-moment relation modeling. The core idea is to sample a subset of moments guided by the learnable templates with an adopted DETR framework. To achieve this, we design a multi-scale visual-linguistic encoder, and an anchor-guided moment decoder paired with a set of learnable templates. Experimental results on three public datasets demonstrate the superior performance of MS-DETR.", + "authors": [ + "Wang Jing", + "Aixin Sun", + "Hao Zhang", + "Xiaoli Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.77", + "point2d": [ + -59.570167541503906, + 60.37327575683594 + ], + "cluster": 26.0 + }, + { + "idx": 79, + "title": "Diverse Demonstrations Improve In-context Compositional Generalization", + "abstract": "In-context learning has shown great success in i.i.d semantic parsing splits, where the training and test sets are drawn from the same distribution. In this setup, models are typically prompted with demonstrations that are similar to the input utterance. However, in the setup of compositional generalization, where models are tested on outputs with structures that are absent from the training set, selecting similar demonstrations is insufficient, as often no example will be similar enough to the input. In this work, we propose a method to select diverse demonstrations that aims to collectively cover all of the structures required in the output program, in order to encourage the model to generalize to new structures from these demonstrations. 
We empirically show that combining diverse demonstrations with in-context learning substantially improves performance across three compositional generalization semantic parsing datasets in the pure in-context learning setup and when combined with finetuning.", + "authors": [ + "Itay Levy", + "Ben Bogin", + "Jonathan Berant" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.78", + "point2d": [ + -26.605737686157227, + -54.9971809387207 + ], + "cluster": 41.0 + }, + { + "idx": 80, + "title": "Self-Adaptive In-Context Learning: An Information Compression Perspective for In-Context Example Selection and Ordering", + "abstract": "Despite the surprising few-shot performance of in-context learning (ICL), it is still a common practice to randomly sample examples to serve as context. This paper advocates a new principle for ICL: self-adaptive in-context learning. The self-adaption mechanism is introduced to help each sample find an in-context example organization (i.e., selection and permutation) that can derive the correct prediction, thus maximizing performance. To validate the effectiveness of self-adaptive ICL, we propose a general select-then-rank framework and instantiate it with new selection and ranking algorithms. Upon extensive evaluation on eight different NLP datasets, our self-adaptive ICL method achieves a 40% relative improvement over the common practice setting. Further analysis reveals the enormous potential of self-adaptive ICL that it might be able to close the gap between ICL and finetuning given more advanced algorithms. Our code will be released to facilitate future research.", + "authors": [ + "Zhiyong Wu", + "Yaoxiang Wang", + "Jiacheng Ye", + "Lingpeng Kong" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.79", + "point2d": [ + -10.29226303100586, + -24.4996337890625 + ], + "cluster": 17.0 + }, + { + "idx": 81, + "title": "On the Efficacy of Sampling Adapters", + "abstract": "Sampling-based decoding strategies are widely employed for generating text from probabilistic models, yet standard ancestral sampling often results in text that is degenerate or incoherent. To alleviate this issue, various modifications to a model\u2019s sampling distribution, such as top-p or top-k sampling, have been introduced and are now ubiquitously used in language generation systems. We propose a unified framework for understanding these techniques, which we term sampling adapters. Sampling adapters often lead to qualitatively better text, which raises the question: From a formal perspective, how are they changing the token-level distributions of language generation models? And why do these local changes lead to higher-quality text? We argue that the shift they enforce can be viewed as a trade-off between precision and recall: while the model loses its ability to produce certain strings, its precision rate on desirable text increases. While this trade-off is not reflected in standard metrics of distribution quality (such as perplexity), we find that several precision-emphasizing measures indeed indicate that sampling adapters can lead to probability distributions more aligned with the true distribution. 
Further, these measures correlate with higher sequence-level quality scores, specifically Mauve.", + "authors": [ + "Clara Meister", + "Tiago Pimentel", + "Luca Malagutti", + "Ethan Wilcox", + "Ryan Cotterell" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.80", + "point2d": [ + -26.413625717163086, + 16.32440757751465 + ], + "cluster": 4.0 + }, + { + "idx": 82, + "title": "Cross-Domain Data Augmentation with Domain-Adaptive Language Modeling for Aspect-Based Sentiment Analysis", + "abstract": "Cross-domain Aspect-Based Sentiment Analysis (ABSA) aims to leverage the useful knowledge from a source domain to identify aspect-sentiment pairs in sentences from a target domain. To tackle the task, several recent works explore a new unsupervised domain adaptation framework, i.e., Cross-Domain Data Augmentation (CDDA), aiming to directly generate abundant labeled target-domain data based on the labeled source-domain data. However, these CDDA methods still suffer from several issues: 1) they preserve many source-specific attributes such as syntactic structures; 2) they lack fluency and coherence; 3) they limit the diversity of generated data. To address these issues, we propose a new cross-domain Data Augmentation approach based on Domain-Adaptive Language Modeling named DA^2LM, which contains three stages: 1) assigning pseudo labels to unlabeled target-domain data; 2) unifying the process of token generation and labeling with a Domain-Adaptive Language Model (DALM) to learn the shared context and annotation across domains; 3) using the trained DALM to generate labeled target-domain data. Experiments show that DA^2LM consistently outperforms previous feature adaptation and CDDA methods on both ABSA and Aspect Extraction tasks. The source code is publicly released at https://github.com/NUSTM/DALM.", + "authors": [ + "Jianfei Yu", + "Qiankun Zhao", + "Rui Xia" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.81", + "point2d": [ + 15.735419273376465, + -33.0475959777832 + ], + "cluster": 13.0 + }, + { + "idx": 83, + "title": "Compositional Data Augmentation for Abstractive Conversation Summarization", + "abstract": "Recent abstractive conversation summarization systems generally rely on large-scale datasets with annotated summaries. However, collecting and annotating these conversations can be a time-consuming and labor-intensive task. To address this issue, in this work, we present a sub-structure level compositional data augmentation method, Compo, for generating diverse and high-quality pairs of conversations and summaries. Specifically, Compo first extracts conversation structures like topic splits and action triples as basic units. Then we organize these semantically meaningful conversation snippets compositionally to create new training instances. Additionally, we explore noise-tolerant settings in both self-training and joint-training paradigms to make the most of these augmented samples. Our experiments on benchmark datasets, SAMSum and DialogSum, show that Compo substantially outperforms prior baseline methods by achieving a nearly 10% increase in ROUGE scores with limited data.
Code is available at https://github.com/ozyyshr/Compo.", + "authors": [ + "Siru Ouyang", + "Jiaao Chen", + "Jiawei Han", + "Diyi Yang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.82", + "point2d": [ + -3.987501859664917, + 54.46512985229492 + ], + "cluster": 7.0 + }, + { + "idx": 84, + "title": "PMAES: Prompt-mapping Contrastive Learning for Cross-prompt Automated Essay Scoring", + "abstract": "Current cross-prompt automated essay scoring (AES) is a challenging task due to the large discrepancies between different prompts, such as different genres and expressions. The main goal of current cross-prompt AES systems is to learn enough shared features between the source and target prompts to grade well on the target prompt. However, because the features are captured based on the original prompt representation, they may be limited by being extracted directly between essays. In fact, when the representations of two prompts are more similar, we can gain more shared features between them. Based on this motivation, in this paper, we propose a learning strategy called \u201cprompt-mapping\u201d to learn about more consistent representations of source and target prompts. In this way, we can obtain more shared features between the two prompts and use them to better represent the essays for the target prompt. Experimental results on the ASAP++ dataset demonstrate the effectiveness of our method. We also design experiments in different settings to show that our method can be applied in different scenarios. Our code is available at https://github.com/gdufsnlp/PMAES.", + "authors": [ + "Yuan Chen", + "Xia Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.83", + "point2d": [ + -14.042813301086426, + 24.96453094482422 + ], + "cluster": 3.0 + }, + { + "idx": 85, + "title": "Marked Personas: Using Natural Language Prompts to Measure Stereotypes in Language Models", + "abstract": "To recognize and mitigate harms from large language models (LLMs), we need to understand the prevalence and nuances of stereotypes in LLM outputs. Toward this end, we present Marked Personas, a prompt-based method to measure stereotypes in LLMs for intersectional demographic groups without any lexicon or data labeling.Grounded in the sociolinguistic concept of markedness (which characterizes explicitly linguistically marked categories versus unmarked defaults), our proposed method is twofold: 1) prompting an LLM to generate personas, i.e., natural language descriptions, of the target demographic group alongside personas of unmarked, default groups; 2) identifying the words that significantly distinguish personas of the target group from corresponding unmarked ones.We find that the portrayals generated by GPT-3.5 and GPT-4 contain higher rates of racial stereotypes than human-written portrayals using the same prompts. The words distinguishing personas of marked (non-white, non-male) groups reflect patterns of othering and exoticizing these demographics. An intersectional lens further reveals tropes that dominate portrayals of marginalized groups, such as tropicalism and the hypersexualization of minoritized women. 
These representational harms have concerning implications for downstream applications like story generation.", + "authors": [ + "Myra Cheng", + "Esin Durmus", + "Dan Jurafsky" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.84", + "point2d": [ + 24.736217498779297, + 33.07784652709961 + ], + "cluster": 10.0 + }, + { + "idx": 86, + "title": "On Prefix-tuning for Lightweight Out-of-distribution Detection", + "abstract": "Out-of-distribution (OOD) detection, a fundamental task vexing real-world applications, has attracted growing attention in the NLP community. Recently fine-tuning based methods have made promising progress. However, it could be costly to store fine-tuned models for each scenario. In this paper, we depart from the classic fine-tuning based OOD detection toward a parameter-efficient alternative, and propose an unsupervised prefix-tuning based OOD detection framework termed PTO. Additionally, to take advantage of optional training data labels and targeted OOD data, two practical extensions of PTO are further proposed. Overall, PTO and its extensions offer several key advantages of being lightweight, easy-to-reproduce, and theoretically justified. Experimental results show that our methods perform comparably to, even better than, existing fine-tuning based OOD detection approaches under a wide range of metrics, detection settings, and OOD types.", + "authors": [ + "Yawen Ouyang", + "Yongchang Cao", + "Yuan Gao", + "Zhen Wu", + "Jianbing Zhang", + "Xinyu Dai" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.85", + "point2d": [ + -1.4989300966262817, + -5.989083290100098 + ], + "cluster": 17.0 + }, + { + "idx": 87, + "title": "GEC-DePenD: Non-Autoregressive Grammatical Error Correction with Decoupled Permutation and Decoding", + "abstract": "Grammatical error correction (GEC) is an important NLP task that is currently usually solved with autoregressive sequence-to-sequence models. However, approaches of this class are inherently slow due to one-by-one token generation, so non-autoregressive alternatives are needed. In this work, we propose a novel non-autoregressive approach to GEC that decouples the architecture into a permutation network that outputs a self-attention weight matrix that can be used in beam search to find the best permutation of input tokens (with auxiliary tokens) and a decoder network based on a step-unrolled denoising autoencoder that fills in specific tokens. This allows us to find the token permutation after only one forward pass of the permutation network, avoiding autoregressive constructions. We show that the resulting network improves over previously known non-autoregressive methods for GEC and reaches the level of autoregressive methods that do not use language-specific synthetic data generation methods. 
Our results are supported by comprehensive experimental validation on the CoNLL-2014 and BEA datasets and by an extensive ablation study that justifies our architectural and algorithmic choices.", + "authors": [ + "Konstantin Yakovlev", + "Alexander Podolskiy", + "Andrey Bout", + "Sergey Nikolenko", + "Irina Piontkovskaya" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.86", + "point2d": [ + -36.40522384643555, + 12.994097709655762 + ], + "cluster": 30.0 + }, + { + "idx": 88, + "title": "Measuring Progress in Fine-grained Vision-and-Language Understanding", + "abstract": "While pretraining on large-scale image\u2013text data from the Web has facilitated rapid progress on many vision-and-language (V&L) tasks, recent work has demonstrated that pretrained models lack \u201cfine-grained\u201d understanding, such as the ability to recognise relationships, verbs, and numbers in images. This has resulted in increased interest in the community in developing new benchmarks or models for such capabilities. To better understand and quantify progress in this direction, we investigate four competitive V&L models on four fine-grained benchmarks. Through our analysis, we find that X-VLM (Zeng et al., 2022) consistently outperforms other baselines, and that modelling innovations can impact performance more than scaling Web data, which even degrades performance sometimes. Through a deeper investigation of X-VLM, we highlight the importance of both novel losses and rich data sources for learning fine-grained skills. Finally, we inspect training dynamics, and discover that for some tasks, performance peaks early in training or significantly fluctuates, never converging.", + "authors": [ + "Emanuele Bugliarello", + "Laurent Sartran", + "Aishwarya Agrawal", + "Lisa Anne Hendricks", + "Aida Nematzadeh" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.87", + "point2d": [ + -52.24321365356445, + 32.71247100830078 + ], + "cluster": 26.0 + }, + { + "idx": 89, + "title": "Vision Meets Definitions: Unsupervised Visual Word Sense Disambiguation Incorporating Gloss Information", + "abstract": "Visual Word Sense Disambiguation (VWSD) is the task of finding the image that most accurately depicts the correct sense of the target word for the given context. Previously, image-text matching models often suffered from recognizing polysemous words. This paper introduces an unsupervised VWSD approach that uses gloss information from an external lexical knowledge-base, especially the sense definitions. Specifically, we suggest employing Bayesian inference to incorporate the sense definitions when sense information of the answer is not provided. In addition, to ameliorate the out-of-dictionary (OOD) issue, we propose context-aware definition generation with GPT-3. Experimental results show that the VWSD performance significantly increased with our Bayesian inference-based approach.
In addition, our context-aware definition generation achieved a prominent performance improvement on OOD examples, outperforming the existing definition generation method.", + "authors": [ + "Sunjae Kwon", + "Rishabh Garodia", + "Minhwa Lee", + "Zhichao Yang", + "Hong Yu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.88", + "point2d": [ + -48.96210861206055, + 35.273738861083984 + ], + "cluster": 43.0 + }, + { + "idx": 90, + "title": "Chain-of-Skills: A Configurable Model for Open-Domain Question Answering", + "abstract": "The retrieval model is an indispensable component for real-world knowledge-intensive tasks, e.g., open-domain question answering (ODQA). As separate retrieval skills are annotated for different datasets, recent work focuses on customized methods, limiting the model transferability and scalability. In this work, we propose a modular retriever where individual modules correspond to key skills that can be reused across datasets. Our approach supports flexible skill configurations based on the target domain to boost performance. To mitigate task interference, we design a novel modularization parameterization inspired by the sparse Transformer. We demonstrate that our model can benefit from self-supervised pretraining on Wikipedia and fine-tuning using multiple ODQA datasets, both in a multi-task fashion. Our approach outperforms recent self-supervised retrievers in zero-shot evaluations and achieves state-of-the-art fine-tuned retrieval performance on NQ, HotpotQA and OTT-QA.", + "authors": [ + "Kaixin Ma", + "Hao Cheng", + "Yu Zhang", + "Xiaodong Liu", + "Eric Nyberg", + "Jianfeng Gao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.89", + "point2d": [ + 7.538712501525879, + -10.410717964172363 + ], + "cluster": 5.0 + }, + { + "idx": 91, + "title": "Elaboration-Generating Commonsense Question Answering at Scale", + "abstract": "In question answering requiring common sense, language models (e.g., GPT-3) have been used to generate text expressing background knowledge that helps improve performance. Yet the cost of working with such models is very high; in this work, we finetune smaller language models to generate useful intermediate context, referred to here as elaborations. Our framework alternates between updating two language models\u2014an elaboration generator and an answer predictor\u2014allowing each to influence the other. Using less than 0.5% of the parameters of GPT-3, our model outperforms alternatives with similar sizes and closes the gap with GPT-3 on four commonsense question answering benchmarks. Human evaluations show that the quality of the generated elaborations is high.", + "authors": [ + "Wenya Wang", + "Vivek Srikumar", + "Hannaneh Hajishirzi", + "Noah A. Smith" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.90", + "point2d": [ + 65.527099609375, + 5.062660217285156 + ], + "cluster": 5.0 + }, + { + "idx": 92, + "title": "Neural Unsupervised Reconstruction of Protolanguage Word Forms", + "abstract": "We present a state-of-the-art neural approach to the unsupervised reconstruction of ancient word forms. Previous work in this domain used expectation-maximization to predict simple phonological changes between ancient word forms and their cognates in modern languages.
We extend this work with neural models that can capture more complicated phonological and morphological changes. At the same time, we preserve the inductive biases from classical methods by building monotonic alignment constraints into the model and deliberately underfitting during the maximization step. We evaluate our performance on the task of reconstructing Latin from a dataset of cognates across five Romance languages, achieving a notable reduction in edit distance from the target word forms compared to previous methods.", + "authors": [ + "Andre He", + "Nicholas Tomlin", + "Dan Klein" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.91", + "point2d": [ + -40.40733337402344, + -46.744834899902344 + ], + "cluster": 46.0 + }, + { + "idx": 93, + "title": "DaMSTF: Domain Adversarial Learning Enhanced Meta Self-Training for Domain Adaptation", + "abstract": "Self-training emerges as an important research line in domain adaptation. By taking the model\u2019s predictions as the pseudo labels of the unlabeled data, self-training bootstraps the model with pseudo instances in the target domain. However, the prediction errors of pseudo labels (label noise) challenge the performance of self-training. To address this problem, previous approaches only use reliable pseudo instances, i.e., pseudo instances with high prediction confidence, to retrain the model. Although these strategies effectively reduce the label noise, they are prone to missing the hard examples. In this paper, we propose a new self-training framework for domain adaptation, namely the Domain adversarial learning enhanced Self-Training Framework (DaMSTF). Firstly, DaMSTF involves meta-learning to estimate the importance of each pseudo instance, so as to simultaneously reduce the label noise and preserve hard examples. Secondly, we design a meta constructor for constructing the meta-validation set, which guarantees the effectiveness of the meta-learning module by improving the quality of the meta-validation set. Thirdly, we find that the meta-learning module suffers from training guidance vanishment and tends to converge to an inferior optimum. To this end, we employ domain adversarial learning as a heuristic neural network initialization method, which can help the meta-learning module converge to a better optimum. Theoretically and experimentally, we demonstrate the effectiveness of the proposed DaMSTF. On the cross-domain sentiment classification task, DaMSTF improves the performance of BERT by an average of nearly 4%.", + "authors": [ + "Menglong Lu", + "Zhen Huang", + "Yunxiang Zhao", + "Zhiliang Tian", + "Yang Liu", + "Dongsheng Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.92", + "point2d": [ + 3.8945839405059814, + 1.762939691543579 + ], + "cluster": 48.0 + }, + { + "idx": 94, + "title": "On Evaluating Multilingual Compositional Generalization with Translated Datasets", + "abstract": "Compositional generalization allows efficient learning and human-like inductive biases. Since most research investigating compositional generalization in NLP is done on English, important questions remain underexplored. Do the necessary compositional generalization abilities differ across languages? Can models compositionally generalize cross-lingually? As a first step to answering these questions, recent work used neural machine translation to translate datasets for evaluating compositional generalization in semantic parsing.
However, we show that this entails critical semantic distortion. To address this limitation, we craft a faithful rule-based translation of the MCWQ dataset from English to Chinese and Japanese. Even with the resulting robust benchmark, which we call MCWQ-R, we show that the distribution of compositions still suffers due to linguistic divergences, and that multilingual models still struggle with cross-lingual compositional generalization. Our dataset and methodology will serve as useful resources for the study of cross-lingual compositional generalization in other tasks.", + "authors": [ + "Zi Wang", + "Daniel Hershcovich" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.93", + "point2d": [ + -26.968202590942383, + -53.53063201904297 + ], + "cluster": 41.0 + }, + { + "idx": 95, + "title": "FAA: Fine-grained Attention Alignment for Cascade Document Ranking", + "abstract": "Document ranking aims at sorting a collection of documents by their relevance to a query. Contemporary methods explore more efficient transformers or divide long documents into passages to handle the long input. However, intensive query-irrelevant content may lead to harmful distraction and high query latency. Some recent works further propose cascade document ranking models that extract relevant passages with an efficient selector before ranking; however, their selection and ranking modules are almost independently optimized and deployed, leading to selection error reinforcement and sub-optimal performance. In fact, the document ranker can provide fine-grained supervision to make the selector more generalizable and compatible, and the selector built upon a different structure can offer a distinct perspective to assist in document ranking. Inspired by this, we propose a fine-grained attention alignment approach to jointly optimize a cascade document ranking model. Specifically, we utilize the attention activations over the passages from the ranker as fine-grained attention feedback to optimize the selector. Meanwhile, we fuse the relevance scores from the passage selector into the ranker to assist in calculating the cooperative matching representation. Experiments on MS MARCO and TREC DL demonstrate the effectiveness of our method.", + "authors": [ + "Zhen Li", + "Chongyang Tao", + "Jiazhan Feng", + "Tao Shen", + "Dongyan Zhao", + "Xiubo Geng", + "Daxin Jiang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.94", + "point2d": [ + 16.603759765625, + -16.460935592651367 + ], + "cluster": 18.0 + }, + { + "idx": 96, + "title": "Fine-tuning Happens in Tiny Subspaces: Exploring Intrinsic Task-specific Subspaces of Pre-trained Language Models", + "abstract": "Pre-trained language models (PLMs) are known to be overly parameterized and have significant redundancy, indicating a small degree of freedom of the PLMs. Motivated by this observation, in this paper, we study the problem of re-parameterizing and fine-tuning PLMs from a new perspective: discovery of the intrinsic task-specific subspace. Specifically, by exploiting the dynamics of the fine-tuning process for a given task, the parameter optimization trajectory is learned to uncover its intrinsic task-specific subspace. A key finding is that PLMs can be effectively fine-tuned in the subspace with a small number of free parameters. Beyond this, we observe some outlier dimensions emerging during fine-tuning in the subspace.
Disabling these dimensions degrades the model performance significantly. This suggests that these dimensions are crucial for inducing task-specific knowledge in downstream tasks.", + "authors": [ + "Zhong Zhang", + "Bang Liu", + "Junming Shao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.95", + "point2d": [ + -34.12518310546875, + -16.57915496826172 + ], + "cluster": 8.0 + }, + { + "idx": 97, + "title": "Facilitating Multi-turn Emotional Support Conversation with Positive Emotion Elicitation: A Reinforcement Learning Approach", + "abstract": "Emotional support conversation (ESC) aims to provide emotional support (ES) to improve one\u2019s mental state. Existing works are limited to fitting grounded responses and responding strategies (e.g., question), which ignore the effect on ES and lack explicit goals to guide a positive emotional transition. To this end, we introduce a new paradigm to formalize multi-turn ESC as a process of positive emotion elicitation. Addressing this task requires finely adjusting the elicitation intensity in ES as the conversation progresses while maintaining conversational goals like coherence. In this paper, we propose Supporter, a mixture-of-expert-based reinforcement learning model, and carefully design ES and dialogue coherence rewards to guide the policy\u2019s learning for responding. Experiments verify the superiority of Supporter in achieving positive emotion elicitation during responding while maintaining conversational goals including coherence.", + "authors": [ + "Jinfeng Zhou", + "Zhuang Chen", + "Bo Wang", + "Minlie Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.96", + "point2d": [ + 26.91204071044922, + 67.02578735351562 + ], + "cluster": 33.0 + }, + { + "idx": 98, + "title": "Query Enhanced Knowledge-Intensive Conversation via Unsupervised Joint Modeling", + "abstract": "In this paper, we propose an unsupervised query enhanced approach for knowledge-intensive conversations, namely QKConv. There are three modules in QKConv: a query generator, an off-the-shelf knowledge selector, and a response generator. QKConv is optimized through joint training, which produces the response by exploring multiple candidate queries and leveraging corresponding selected knowledge. The joint training relies solely on the dialogue context and target response, dispensing with extra query annotations or knowledge provenance. To evaluate the effectiveness of the proposed QKConv, we conduct experiments on three representative knowledge-intensive conversation datasets: conversational question-answering, task-oriented dialogue, and knowledge-grounded conversation. Experimental results reveal that QKConv performs better than all unsupervised methods across the three datasets and achieves competitive performance compared to supervised methods.", + "authors": [ + "Mingzhu Cai", + "Siqi Bao", + "Xin Tian", + "Huang He", + "Fan Wang", + "Hua Wu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.97", + "point2d": [ + 18.587556838989258, + 55.912330627441406 + ], + "cluster": 24.0 + }, + { + "idx": 99, + "title": "Why Aren\u2019t We NER Yet? Artifacts of ASR Errors in Named Entity Recognition in Spontaneous Speech Transcripts", + "abstract": "Transcripts of spontaneous human speech present a significant obstacle for traditional NER models.
The lack of grammatical structure of spoken utterances and word errors introduced by the ASR make downstream NLP tasks challenging. In this paper, we examine in detail the complex relationship between ASR and NER errors which limit the ability of NER models to recover entity mentions from spontaneous speech transcripts. Using publicly available benchmark datasets (SWNE, Earnings-21, OntoNotes), we present the full taxonomy of ASR-NER errors and measure their true impact on entity recognition. We find that NER models fail spectacularly even if no word errors are introduced by the ASR. We also show why the F1 score is inadequate to evaluate NER models on conversational transcripts.", + "authors": [ + "Piotr Szyma\u0144ski", + "Lukasz Augustyniak", + "Mikolaj Morzy", + "Adrian Szymczak", + "Krzysztof Surdyk", + "Piotr \u017belasko" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.98", + "point2d": [ + 28.541553497314453, + -82.08296203613281 + ], + "cluster": 14.0 + }, + { + "idx": 100, + "title": "Precise Zero-Shot Dense Retrieval without Relevance Labels", + "abstract": "While dense retrieval has been shown to be effective and efficient across tasks and languages, it remains difficult to create effective fully zero-shot dense retrieval systems when no relevance labels are available. In this paper, we recognize the difficulty of zero-shot learning and encoding relevance. Instead, we propose to pivot through Hypothetical Document Embeddings (HyDE). Given a query, HyDE first zero-shot prompts an instruction-following language model (e.g., InstructGPT) to generate a hypothetical document. The document captures relevance patterns but is \u201cfake\u201d and may contain hallucinations. Then, an unsupervised contrastively learned encoder (e.g., Contriever) encodes the document into an embedding vector. This vector identifies a neighborhood in the corpus embedding space, from which similar real documents are retrieved based on vector similarity. This second step grounds the generated document to the actual corpus, with the encoder\u2019s dense bottleneck filtering out the hallucinations. Our experiments show that HyDE significantly outperforms the state-of-the-art unsupervised dense retriever Contriever and shows strong performance comparable to fine-tuned retrievers across various tasks (e.g. web search, QA, fact verification) and in non-English languages (e.g., sw, ko, ja, bn).", + "authors": [ + "Luyu Gao", + "Xueguang Ma", + "Jimmy Lin", + "Jamie Callan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.99", + "point2d": [ + 13.050399780273438, + -12.499276161193848 + ], + "cluster": 18.0 + }, + { + "idx": 101, + "title": "White-Box Multi-Objective Adversarial Attack on Dialogue Generation", + "abstract": "Pre-trained transformers are popular in state-of-the-art dialogue generation (DG) systems. Such language models are, however, vulnerable to various adversarial samples as studied in traditional tasks such as text classification, which inspires our curiosity about their robustness in DG systems. One main challenge of attacking DG models is that perturbations on the current sentence can hardly degrade the response accuracy because the unchanged chat histories are also considered for decision-making. 
Instead of merely pursuing pitfalls of performance metrics such as BLEU, ROUGE, we observe that crafting adversarial samples to force longer generation outputs benefits attack effectiveness\u2014the generated responses are typically irrelevant, lengthy, and repetitive. To this end, we propose a white-box multi-objective attack method called DGSlow. Specifically, DGSlow balances two objectives\u2014generation accuracy and length, via a gradient-based multi-objective optimizer and applies an adaptive searching mechanism to iteratively craft adversarial samples with only a few modifications. Comprehensive experiments on four benchmark datasets demonstrate that DGSlow could significantly degrade state-of-the-art DG models with a higher success rate than traditional accuracy-based methods. Besides, our crafted sentences also exhibit strong transferability in attacking other models.", + "authors": [ + "Yufei Li", + "Zexin Li", + "Yingfan Gao", + "Cong Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.100", + "point2d": [ + 7.19231653213501, + 60.50331115722656 + ], + "cluster": 48.0 + }, + { + "idx": 102, + "title": "A Cautious Generalization Goes a Long Way: Learning Morphophonological Rules", + "abstract": "Explicit linguistic knowledge, encoded by resources such as rule-based morphological analyzers, continues to prove useful in downstream NLP tasks, especially for low-resource languages and dialects. Rules are an important asset in descriptive linguistic grammars. However, creating such resources is usually expensive and non-trivial, especially for spoken varieties with no written standard. In this work, we present a novel approach for automatically learning morphophonological rules of Arabic from a corpus. Motivated by classic cognitive models for rule learning, rules are generalized cautiously. Rules that are memorized for individual items are only allowed to generalize to unseen forms if they are sufficiently reliable in the training data.The learned rules are further examined to ensure that they capture true linguistic phenomena described by domain experts. We also investigate the learnability of rules in low-resource settings across different experimental setups and dialects.", + "authors": [ + "Salam Khalifa", + "Sarah Payne", + "Jordan Kodner", + "Ellen Broselow", + "Owen Rambow" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.101", + "point2d": [ + -34.60078811645508, + -45.09737777709961 + ], + "cluster": 46.0 + }, + { + "idx": 103, + "title": "Few-shot Adaptation Works with UnpredicTable Data", + "abstract": "Prior work on language models (LMs) shows that training on a large number of diverse tasks improves few-shot learning (FSL) performance on new tasks. We take this to the extreme, automatically extracting 413,299 tasks from internet tables - orders of magnitude more than the next-largest public datasets. Finetuning on the resulting dataset leads to improved FSL performance on Natural Language Processing (NLP) tasks, but not proportionally to dataset scale. In fact, we find that narrow subsets of our dataset sometimes outperform more diverse datasets. For example, finetuning on software documentation from support.google.com raises FSL performance by a mean of +7.5% on 52 downstream tasks, which beats training on 40 human-curated NLP datasets (+6.7%). 
Finetuning on various narrow datasets leads to similar broad improvements across test tasks, suggesting that the gains come not from domain adaptation but from adapting to FSL in general. We do not observe clear patterns among the datasets that lead to FSL gains, leaving open questions about why certain data helps with FSL.", + "authors": [ + "Jun Shern Chan", + "Michael Pieler", + "Jonathan Jao", + "J\u00e9r\u00e9my Scheurer", + "Ethan Perez" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.102", + "point2d": [ + -17.70990753173828, + -8.250693321228027 + ], + "cluster": 3.0 + }, + { + "idx": 104, + "title": "Cross-lingual Science Journalism: Select, Simplify and Rewrite Summaries for Non-expert Readers", + "abstract": "Automating Cross-lingual Science Journalism (CSJ) aims to generate popular science summaries from English scientific texts for non-expert readers in their local language. We introduce CSJ as a downstream task of text simplification and cross-lingual scientific summarization to facilitate science journalists\u2019 work. We analyze the performance of possible existing solutions as baselines for the CSJ task. Based on these findings, we propose to combine the three components - SELECT, SIMPLIFY and REWRITE (SSR) - to produce cross-lingual simplified science summaries for non-expert readers. Our empirical evaluation on the Wikipedia dataset shows that SSR significantly outperforms the baselines for the CSJ task and can serve as a strong baseline for future work. We also perform an ablation study investigating the impact of individual components of SSR. Further, we analyze the performance of SSR on a high-quality, real-world CSJ dataset with human evaluation and in-depth analysis, demonstrating the superior performance of SSR for CSJ.", + "authors": [ + "Mehwish Fatima", + "Michael Strube" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.103", + "point2d": [ + 15.145410537719727, + 16.93873405456543 + ], + "cluster": 40.0 + }, + { + "idx": 105, + "title": "HuCurl: Human-induced Curriculum Discovery", + "abstract": "We introduce the problem of curriculum discovery and describe a curriculum learning framework capable of discovering effective curricula in a curriculum space based on prior knowledge about sample difficulty. Using annotation entropy and loss as measures of difficulty, we show that (i) the top-performing discovered curricula for a given model and dataset are often non-monotonic, as opposed to the monotonic curricula in existing literature, (ii) the prevailing easy-to-hard or hard-to-easy transition curricula are often at risk of underperforming, and (iii) the curricula discovered for smaller datasets and models perform well on larger datasets and models respectively. The proposed framework encompasses some of the existing curriculum learning approaches and can discover curricula that outperform them across several NLP tasks.", + "authors": [ + "Mohamed Elgaar", + "Hadi Amiri" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.104", + "point2d": [ + -12.078643798828125, + -40.43029022216797 + ], + "cluster": 39.0 + }, + { + "idx": 106, + "title": "kNN-TL: k-Nearest-Neighbor Transfer Learning for Low-Resource Neural Machine Translation", + "abstract": "Transfer learning has been shown to be an effective technique for enhancing the performance of low-resource neural machine translation (NMT).
This is typically achieved either through fine-tuning a child model with a pre-trained parent model or by utilizing the output of the parent model during the training of the child model. However, these methods do not make use of the parent knowledge during child inference, which may limit the translation performance. In this paper, we propose a k-Nearest-Neighbor Transfer Learning (kNN-TL) approach for low-resource NMT, which leverages the parent knowledge throughout the entire development process of the child model. Our approach includes a parent-child representation alignment method, which ensures consistency in the output representations between the two models, and a child-aware datastore construction method that improves inference efficiency by selectively distilling the parent datastore based on relevance to the child model. Experimental results on four low-resource translation tasks show that kNN-TL outperforms strong baselines. Extensive analyses further demonstrate the effectiveness of our approach. Code and scripts are freely available at https://github.com/NLP2CT/kNN-TL.", + "authors": [ + "Shudong Liu", + "Xuebo Liu", + "Derek F. Wong", + "Zhaocong Li", + "Wenxiang Jiao", + "Lidia S. Chao", + "Min Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.105", + "point2d": [ + -65.39956665039062, + -13.048325538635254 + ], + "cluster": 21.0 + }, + { + "idx": 107, + "title": "Do language models have coherent mental models of everyday things?", + "abstract": "When people think of everyday things like an egg, they typically have a mental image associated with it. This allows them to correctly judge, for example, that \u201cthe yolk surrounds the shell\u201d is a false statement. Do language models similarly have a coherent picture of such everyday things? To investigate this, we propose a benchmark dataset consisting of 100 everyday things, their parts, and the relationships between these parts, expressed as 11,720 \u201cX relation Y?\u201d true/false questions. Using these questions as probes, we observe that state-of-the-art pre-trained language models (LMs) like GPT-3 and Macaw have fragments of knowledge about these everyday things, but do not have fully coherent \u201cparts mental models\u201d (54-59% accurate, 19-43% conditional constraint violation). We propose an extension where we add a constraint satisfaction layer on top of the LM\u2019s raw predictions to apply commonsense constraints. As well as removing inconsistencies, we find that this also significantly improves accuracy (by 16-20%), suggesting how the incoherence of the LM\u2019s pictures of everyday things can be significantly reduced.", + "authors": [ + "Yuling Gu", + "Bhavana Dalvi Mishra", + "Peter Clark" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.106", + "point2d": [ + 39.70292663574219, + -6.8234477043151855 + ], + "cluster": 36.0 + }, + { + "idx": 108, + "title": "Rogue Scores", + "abstract": "Correct, comparable, and reproducible model evaluation is essential for progress in machine learning. Over twenty years, thousands of language and vision models have been evaluated with a popular metric called ROUGE. Does this widespread benchmark metric meet these three evaluation criteria? This systematic review of over two thousand publications using ROUGE finds: (A) Critical evaluation decisions and parameters are routinely omitted, making most reported scores irreproducible.
(B) Differences in evaluation protocol are common, affect scores, and impact the comparability of results reported in many papers. (C) Thousands of papers use nonstandard evaluation packages with software defects that produce provably incorrect scores. Estimating the overall impact of these findings is difficult: because software citations are rare, it is nearly impossible to distinguish between correct ROUGE scores and incorrect \u201crogue scores.\u201d", + "authors": [ + "Max Grusky" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.107", + "point2d": [ + -4.451081275939941, + 1.1349831819534302 + ], + "cluster": 17.0 + }, + { + "idx": 109, + "title": "Instruction Induction: From Few Examples to Natural Language Task Descriptions", + "abstract": "Large language models are able to perform a task by conditioning on a few input-output demonstrations - a paradigm known as in-context learning. We show that language models can explicitly infer an underlying task from a few demonstrations by prompting them to generate a natural language instruction that fits the examples. To explore this ability, we introduce the instruction induction challenge, compile a dataset consisting of 24 tasks, and define a novel evaluation metric based on executing the generated instruction. We discover that, to a large extent, the ability to generate instructions does indeed emerge when using a model that is both large enough and aligned to follow instructions; InstructGPT achieves 65.7% of human performance in our execution-based metric, while the original GPT-3 model reaches only 9.8% of human performance. This surprising result suggests that instruction induction might be a viable learning paradigm in and of itself, where instead of fitting a set of latent continuous parameters to the data, one searches for the best description in the natural language hypothesis space.", + "authors": [ + "Or Honovich", + "Uri Shaham", + "Samuel R. Bowman", + "Omer Levy" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.108", + "point2d": [ + -18.894933700561523, + -20.309961318969727 + ], + "cluster": 3.0 + }, + { + "idx": 110, + "title": "In-Context Analogical Reasoning with Pre-Trained Language Models", + "abstract": "Analogical reasoning is a fundamental capacity of human cognition that allows us to reason abstractly about novel situations by relating them to past experiences. While it is thought to be essential for robust reasoning in AI systems, conventional approaches require significant training and/or hard-coding of domain knowledge to be applied to benchmark tasks. Inspired by cognitive science research that has found connections between human language and analogy-making, we explore the use of intuitive language-based abstractions to support analogy in AI systems. Specifically, we apply large pre-trained language models (PLMs) to visual Raven\u2019s Progressive Matrices (RPM), a common relational reasoning test. By simply encoding the perceptual features of the problem into language form, we find that PLMs exhibit a striking capacity for zero-shot relational reasoning, exceeding human performance and nearing supervised vision-based methods. We explore different encodings that vary the level of abstraction over task features, finding that higher-level abstractions further strengthen PLMs\u2019 analogical reasoning. 
Our detailed analysis reveals insights on the role of model complexity, in-context learning, and prior knowledge in solving RPM tasks.", + "authors": [ + "Xiaoyang Hu", + "Shane Storks", + "Richard Lewis", + "Joyce Chai" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.109", + "point2d": [ + 41.22701644897461, + -15.284869194030762 + ], + "cluster": 36.0 + }, + { + "idx": 111, + "title": "Peek Across: Improving Multi-Document Modeling via Cross-Document Question-Answering", + "abstract": "The integration of multi-document pre-training objectives into language models has resulted in remarkable improvements in multi-document downstream tasks. In this work, we propose extending this idea by pre-training a generic multi-document model from a novel cross-document question answering pre-training objective. To that end, given a set (or cluster) of topically-related documents, we systematically generate semantically-oriented questions from a salient sentence in one document and challenge the model, during pre-training, to answer these questions while \u201cpeeking\u201d into other topically-related documents. In a similar manner, the model is also challenged to recover the sentence from which the question was generated, again while leveraging cross-document information. This novel multi-document QA formulation directs the model to better recover cross-text informational relations, and introduces a natural augmentation that artificially increases the pre-training data. Further, unlike prior multi-document models that focus on either classification or summarization tasks, our pre-training objective formulation enables the model to perform tasks that involve both short text generation (e.g., QA) and long text generation (e.g., summarization). Following this scheme, we pre-train our model - termed QAmden - and evaluate its performance across several multi-document tasks, including multi-document QA, summarization, and query-focused summarization, yielding improvements of up to 7%, and significantly outperforms zero-shot GPT-3.5 and GPT-4.", + "authors": [ + "Avi Caciularu", + "Matthew Peters", + "Jacob Goldberger", + "Ido Dagan", + "Arman Cohan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.110", + "point2d": [ + 56.5660285949707, + 15.798768043518066 + ], + "cluster": 5.0 + }, + { + "idx": 112, + "title": "Tailoring Instructions to Student\u2019s Learning Levels Boosts Knowledge Distillation", + "abstract": "It has been commonly observed that a teacher model with superior performance does not necessarily result in a stronger student, highlighting a discrepancy between current teacher training practices and effective knowledge transfer. In order to enhance the guidance of the teacher training process, we introduce the concept of distillation influence to determine the impact of distillation from each training sample on the student\u2019s generalization ability. In this paper, we propose Learning Good Teacher Matters (LGTM), an efficient training technique for incorporating distillation influence into the teacher\u2019s learning process. 
By prioritizing samples that are likely to enhance the student\u2019s generalization ability, our LGTM outperforms 10 common knowledge distillation baselines on 6 text classification tasks in the GLUE benchmark.", + "authors": [ + "Yuxin Ren", + "Zihan Zhong", + "Xingjian Shi", + "Yi Zhu", + "Chun Yuan", + "Mu Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.111", + "point2d": [ + -49.04058074951172, + -23.166587829589844 + ], + "cluster": 39.0 + }, + { + "idx": 113, + "title": "REV: Information-Theoretic Evaluation of Free-Text Rationales", + "abstract": "Generating free-text rationales is a promising step towards explainable NLP, yet evaluating such rationales remains a challenge. Existing metrics have mostly focused on measuring the association between the rationale and a given label. We argue that an ideal metric should focus on the new information uniquely provided in the rationale that is otherwise not provided in the input or the label. We investigate this research problem from an information-theoretic perspective using conditional V-information (Hewitt et al., 2021). More concretely, we propose a metric called REV (Rationale Evaluation with conditional V-information), to quantify the amount of new, label-relevant information in a rationale beyond the information already available in the input or the label. Experiments across four benchmarks with reasoning tasks, including chain-of-thought, demonstrate the effectiveness of REV in evaluating rationale-label pairs, compared to existing metrics. We further demonstrate REV is consistent with human judgments on rationale evaluations and provides more sensitive measurements of new information in free-text rationales. When used alongside traditional performance metrics, REV provides deeper insights into models\u2019 reasoning and prediction processes.", + "authors": [ + "Hanjie Chen", + "Faeze Brahman", + "Xiang Ren", + "Yangfeng Ji", + "Yejin Choi", + "Swabha Swayamdipta" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.112", + "point2d": [ + 31.716955184936523, + -6.943453311920166 + ], + "cluster": 31.0 + }, + { + "idx": 114, + "title": "ELQA: A Corpus of Metalinguistic Questions and Answers about English", + "abstract": "We present ELQA, a corpus of questions and answers in and about the English language. Collected from two online forums, the >70k questions (from English learners and others) cover wide-ranging topics including grammar, meaning, fluency, and etymology. The answers include descriptions of general properties of English vocabulary and grammar as well as explanations about specific (correct and incorrect) usage examples. Unlike most NLP datasets, this corpus is metalinguistic\u2014it consists of language about language. As such, it can facilitate investigations of the metalinguistic capabilities of NLU models, as well as educational applications in the language learning domain. 
To study this, we define a free-form question answering task on our dataset and conduct evaluations on multiple LLMs (Large Language Models) to analyze their capacity to generate metalinguistic answers.", + "authors": [ + "Shabnam Behzad", + "Keisuke Sakaguchi", + "Nathan Schneider", + "Amir Zeldes" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.113", + "point2d": [ + 69.109130859375, + 15.592232704162598 + ], + "cluster": 5.0 + }, + { + "idx": 115, + "title": "Divide, Conquer, and Combine: Mixture of Semantic-Independent Experts for Zero-Shot Dialogue State Tracking", + "abstract": "Zero-shot transfer learning for Dialogue State Tracking (DST) helps to handle a variety of task-oriented dialogue domains without the cost of collecting in-domain data. Existing works mainly study common data- or model-level augmentation methods to enhance the generalization but fail to effectively decouple semantics of samples, limiting the zero-shot performance of DST. In this paper, we present a simple and effective \u201cdivide, conquer and combine\u201d solution, which explicitly disentangles the semantics of seen data, and leverages the performance and robustness with the mixture-of-experts mechanism. Specifically, we divide the seen data into semantically independent subsets and train corresponding experts, the newly unseen samples are mapped and inferred with mixture-of-experts with our designed ensemble inference. Extensive experiments on MultiWOZ2.1 upon T5-Adapter show our schema significantly and consistently improves the zero-shot performance, achieving the SOTA on settings without external knowledge, with only 10M trainable parameters.", + "authors": [ + "Qingyue Wang", + "Liang Ding", + "Yanan Cao", + "Yibing Zhan", + "Zheng Lin", + "Shi Wang", + "Dacheng Tao", + "Li Guo" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.114", + "point2d": [ + 2.521076202392578, + 71.82329559326172 + ], + "cluster": 49.0 + }, + { + "idx": 116, + "title": "BIG-C: a Multimodal Multi-Purpose Dataset for Bemba", + "abstract": "We present BIG-C (Bemba Image Grounded Conversations), a large multimodal dataset for Bemba. While Bemba is the most populous language of Zambia, it exhibits a dearth of resources which render the development of language technologies or language processing research almost impossible. The dataset is comprised of multi-turn dialogues between Bemba speakers based on images, transcribed and translated into English. There are more than 92,000 utterances/sentences, amounting to more than 180 hours of audio data with corresponding transcriptions and English translations. We also provide baselines on speech recognition (ASR), machine translation (MT) and speech translation (ST) tasks, and sketch out other potential future multimodal uses of our dataset. We hope that by making the dataset available to the research community, this work will foster research and encourage collaboration across the language, speech, and vision communities especially for languages outside the \u201ctraditionally\u201d used high-resourced ones. 
All data and code are publicly available: [https://github.com/csikasote/bigc](https://github.com/csikasote/bigc).", + "authors": [ + "Claytone Sikasote", + "Eunice Mukonde", + "Md Mahfuz Ibn Alam", + "Antonios Anastasopoulos" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.115", + "point2d": [ + -70.30069732666016, + 24.339340209960938 + ], + "cluster": 37.0 + }, + { + "idx": 117, + "title": "Schema-Guided User Satisfaction Modeling for Task-Oriented Dialogues", + "abstract": "User Satisfaction Modeling (USM) is one of the popular choices for task-oriented dialogue systems evaluation, where user satisfaction typically depends on whether the user\u2019s task goals were fulfilled by the system. Task-oriented dialogue systems use task schema, which is a set of task attributes, to encode the user\u2019s task goals. Existing studies on USM neglect explicitly modeling the user\u2019s task goals fulfillment using the task schema. In this paper, we propose SG-USM, a novel schema-guided user satisfaction modeling framework. It explicitly models the degree to which the user\u2019s preferences regarding the task attributes are fulfilled by the system for predicting the user\u2019s satisfaction level. SG-USM employs a pre-trained language model for encoding dialogue context and task attributes. Further, it employs a fulfillment representation layer for learning how many task attributes have been fulfilled in the dialogue, and an importance predictor component for calculating the importance of task attributes. Finally, it predicts the user satisfaction based on task attribute fulfillment and task attribute importance. Experimental results on benchmark datasets (i.e. MWOZ, SGD, ReDial, and JDDC) show that SG-USM consistently outperforms competitive existing methods. Our extensive analysis demonstrates that SG-USM can improve the interpretability of user satisfaction modeling, has good scalability as it can effectively deal with unseen tasks and can also effectively work in low-resource settings by leveraging unlabeled data. Code is available at https://github.com/amzn/user-satisfaction-modeling.", + "authors": [ + "Yue Feng", + "Yunlong Jiao", + "Animesh Prasad", + "Nikolaos Aletras", + "Emine Yilmaz", + "Gabriella Kazai" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.116", + "point2d": [ + 18.16903305053711, + 73.40364837646484 + ], + "cluster": 24.0 + }, + { + "idx": 118, + "title": "Robust Multi-bit Natural Language Watermarking through Invariant Features", + "abstract": "Recent years have witnessed a proliferation of valuable original natural language contents found in subscription-based media outlets, web novel platforms, and outputs of large language models. However, these contents are susceptible to illegal piracy and potential misuse without proper security measures. This calls for a secure watermarking system to guarantee copyright protection through leakage tracing or ownership identification. To effectively combat piracy and protect copyrights, a multi-bit watermarking framework should be able to embed adequate bits of information and extract the watermarks in a robust manner despite possible corruption. In this work, we explore ways to advance both payload and robustness by following a well-known proposition from image watermarking and identify features in natural language that are invariant to minor corruption. 
Through a systematic analysis of the possible sources of errors, we further propose a corruption-resistant infill model. Our full method improves upon the previous work on robustness by +16.8% point on average on four datasets, three corruption types, and two corruption ratios", + "authors": [ + "KiYoon Yoo", + "Wonhyuk Ahn", + "Jiho Jang", + "Nojun Kwak" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.117", + "point2d": [ + -7.291734218597412, + 12.669325828552246 + ], + "cluster": 15.0 + }, + { + "idx": 119, + "title": "KALM: Knowledge-Aware Integration of Local, Document, and Global Contexts for Long Document Understanding", + "abstract": "With the advent of pre-trained language models (LMs), increasing research efforts have been focusing on infusing commonsense and domain-specific knowledge to prepare LMs for downstream tasks. These works attempt to leverage knowledge graphs, the de facto standard of symbolic knowledge representation, along with pre-trained LMs. While existing approaches leverage external knowledge, it remains an open question how to jointly incorporate knowledge graphs represented in varying contexts \u2014 from local (e.g., sentence), document-level, to global knowledge, to enable knowledge-rich and interpretable exchange across contexts. In addition, incorporating varying contexts can especially benefit long document understanding tasks that leverage pre-trained LMs, typically bounded by the input sequence length. In light of these challenges, we propose KALM, a language model that jointly leverages knowledge in local, document-level, and global contexts for long document understanding. KALM firstly encodes long documents and knowledge graphs into the three knowledge-aware context representations. KALM then processes each context with context-specific layers. These context-specific layers are followed by a ContextFusion layer that facilitates knowledge exchange to derive an overarching document representation. Extensive experiments demonstrate that KALM achieves state-of-the-art performance on three long document understanding tasks across 6 datasets/settings. Further analyses reveal that the three knowledge-aware contexts are complementary and they all contribute to model performance, while the importance and information exchange patterns of different contexts vary on different tasks and datasets.", + "authors": [ + "Shangbin Feng", + "Zhaoxuan Tan", + "Wenqian Zhang", + "Zhenyu Lei", + "Yulia Tsvetkov" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.118", + "point2d": [ + -21.678386688232422, + -28.34775161743164 + ], + "cluster": 20.0 + }, + { + "idx": 120, + "title": "AtTGen: Attribute Tree Generation for Real-World Attribute Joint Extraction", + "abstract": "Attribute extraction aims to identify attribute names and the corresponding values from descriptive texts, which is the foundation for extensive downstream applications such as knowledge graph construction, search engines, and e-Commerce. In previous studies, attribute extraction is generally treated as a classification problem for predicting attribute types or a sequence tagging problem for labeling attribute values, where two paradigms, i.e., closed-world and open-world assumption, are involved. However, both of these paradigms have limitations in terms of real-world applications. 
And prior studies attempting to integrate these paradigms through ensemble, pipeline, and co-training models, still face challenges like cascading errors, high computational overhead, and difficulty in training. To address these existing problems, this paper presents Attribute Tree, a unified formulation for real-world attribute extraction application, where closed-world, open-world, and semi-open attribute extraction tasks are modeled uniformly. Then a text-to-tree generation model, AtTGen, is proposed to learn annotations from different scenarios efficiently and consistently. Experiments demonstrate that our proposed paradigm well covers various scenarios for real-world applications, and the model achieves state-of-the-art, outperforming existing methods by a large margin on three datasets. Our code, pretrained model, and datasets are available at https://github.com/lsvih/AtTGen.", + "authors": [ + "Yanzeng Li", + "Bingcong Xue", + "Ruoyu Zhang", + "Lei Zou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.119", + "point2d": [ + 28.50520133972168, + -60.931278228759766 + ], + "cluster": 38.0 + }, + { + "idx": 121, + "title": "Extractive is not Faithful: An Investigation of Broad Unfaithfulness Problems in Extractive Summarization", + "abstract": "The problems of unfaithful summaries have been widely discussed under the context of abstractive summarization. Though extractive summarization is less prone to the common unfaithfulness issues of abstractive summaries, does that mean extractive is equal to faithful? Turns out that the answer is no. In this work, we define a typology with five types of broad unfaithfulness problems (including and beyond not-entailment) that can appear in extractive summaries, including incorrect coreference, incomplete coreference, incorrect discourse, incomplete discourse, as well as other misleading information. We ask humans to label these problems out of 1600 English summaries produced by 16 diverse extractive systems. We find that 30% of the summaries have at least one of the five issues. To automatically detect these problems, we find that 5 existing faithfulness evaluation metrics for summarization have poor correlations with human judgment. To remedy this, we propose a new metric, ExtEval, that is designed for detecting unfaithful extractive summaries and is shown to have the best performance. We hope our work can increase the awareness of unfaithfulness problems in extractive summarization and help future work to evaluate and resolve these issues.", + "authors": [ + "Shiyue Zhang", + "David Wan", + "Mohit Bansal" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.120", + "point2d": [ + -7.505527496337891, + 46.664833068847656 + ], + "cluster": 47.0 + }, + { + "idx": 122, + "title": "Improving Translation Quality Estimation with Bias Mitigation", + "abstract": "State-of-the-art translation Quality Estimation (QE) models are proven to be biased. More specifically, they over-rely on monolingual features while ignoring the bilingual semantic alignment. In this work, we propose a novel method to mitigate the bias of the QE model and improve estimation performance. Our method is based on the contrastive learning between clean and noisy sentence pairs. We first introduce noise to the target side of the parallel sentence pair, forming the negative samples. 
With the original parallel pairs as the positive sample, the QE model is contrastively trained to distinguish the positive samples from the negative ones. This objective is jointly trained with the regression-style quality estimation, so as to prevent the QE model from overfitting to monolingual features. Experiments on WMT QE evaluation datasets demonstrate that our method improves the estimation performance by a large margin while mitigating the bias.", + "authors": [ + "Hui Huang", + "Shuangzhi Wu", + "Kehai Chen", + "Hui Di", + "Muyun Yang", + "Tiejun Zhao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.121", + "point2d": [ + -74.5358657836914, + -6.658355236053467 + ], + "cluster": 1.0 + }, + { + "idx": 123, + "title": "Breeding Machine Translations: Evolutionary approach to survive and thrive in the world of automated evaluation", + "abstract": "We propose a genetic algorithm (GA) based method for modifying n-best lists produced by a machine translation (MT) system. Our method offers an innovative approach to improving MT quality and identifying weaknesses in evaluation metrics. Using common GA operations (mutation and crossover) on a list of hypotheses in combination with a fitness function (an arbitrary MT metric), we obtain novel and diverse outputs with high metric scores. With a combination of multiple MT metrics as the fitness function, the proposed method leads to an increase in translation quality as measured by other held-out automatic metrics. With a single metric (including popular ones such as COMET) as the fitness function, we find blind spots and flaws in the metric. This allows for an automated search for adversarial examples in an arbitrary metric, without prior assumptions on the form of such example. As a demonstration of the method, we create datasets of adversarial examples and use them to show that reference-free COMET is substantially less robust than the reference-based version.", + "authors": [ + "Josef Jon", + "Ond\u0159ej Bojar" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.122", + "point2d": [ + -72.57929992675781, + -5.211593151092529 + ], + "cluster": 1.0 + }, + { + "idx": 124, + "title": "MoralDial: A Framework to Train and Evaluate Moral Dialogue Systems via Moral Discussions", + "abstract": "Morality in dialogue systems has raised great attention in research recently. A moral dialogue system aligned with users\u2019 values could enhance conversation engagement and user connections. In this paper, we propose a framework, MoralDial to train and evaluate moral dialogue systems. In our framework, we first explore the communication mechanisms of morality and resolve expressed morality into three parts, which indicate the roadmap for building a moral dialogue system. Based on that, we design a simple yet effective method: constructing moral discussions between simulated specific users and the dialogue system. The constructed discussions consist of expressing, explaining, revising, and inferring moral views in dialogue exchanges, which makes conversational models learn morality well in a natural manner. Furthermore, we propose a novel evaluation method under the framework. We evaluate the multiple aspects of morality by judging the relation between dialogue responses and human values in discussions, where the multifaceted nature of morality is particularly considered. 
Automatic and manual experiments demonstrate that our framework is promising to train and evaluate moral dialogue systems.", + "authors": [ + "Hao Sun", + "Zhexin Zhang", + "Fei Mi", + "Yasheng Wang", + "Wei Liu", + "Jianwei Cui", + "Bin Wang", + "Qun Liu", + "Minlie Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.123", + "point2d": [ + 32.727325439453125, + 43.379150390625 + ], + "cluster": 33.0 + }, + { + "idx": 125, + "title": "Denoising Bottleneck with Mutual Information Maximization for Video Multimodal Fusion", + "abstract": "Video multimodal fusion aims to integrate multimodal signals in videos, such as visual, audio and text, to make a complementary prediction with multiple modalities contents. However, unlike other image-text multimodal tasks, video has longer multimodal sequences with more redundancy and noise in both visual and audio modalities. Prior denoising methods like forget gate are coarse in the granularity of noise filtering. They often suppress the redundant and noisy information at the risk of losing critical information. Therefore, we propose a denoising bottleneck fusion (DBF) model for fine-grained video multimodal fusion. On the one hand, we employ a bottleneck mechanism to filter out noise and redundancy with a restrained receptive field. On the other hand, we use a mutual information maximization module to regulate the filter-out module to preserve key information within different modalities. Our DBF model achieves significant improvement over current state-of-the-art baselines on multiple benchmarks covering multimodal sentiment analysis and multimodal summarization tasks. It proves that our model can effectively capture salient features from noisy and redundant video, audio, and text inputs. The code for this paper will be publicly available at https://github.com/WSXRHFG/DBF", + "authors": [ + "Shaoxiang Wu", + "Damai Dai", + "Ziwei Qin", + "Tianyu Liu", + "Binghuai Lin", + "Yunbo Cao", + "Zhifang Sui" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.124", + "point2d": [ + -40.880672454833984, + 56.3044319152832 + ], + "cluster": 16.0 + }, + { + "idx": 126, + "title": "SimLM: Pre-training with Representation Bottleneck for Dense Passage Retrieval", + "abstract": "In this paper, we propose SimLM (Similarity matching with Language Model pre-training), a simple yet effective pre-training method for dense passage retrieval. It employs a simple bottleneck architecture that learns to compress the passage information into a dense vector through self-supervised pre-training. We use a replaced language modeling objective, which is inspired by ELECTRA (Clark et al., 2020), to improve the sample efficiency and reduce the mismatch of the input distribution between pre-training and fine-tuning. SimLM only requires access to an unlabeled corpus and is more broadly applicable when there are no labeled data or queries. We conduct experiments on several large-scale passage retrieval datasets and show substantial improvements over strong baselines under various settings. Remarkably, SimLM even outperforms multi-vector approaches such as ColBERTv2 (Santhanam et al., 2021) which incurs significantly more storage cost. 
Our code and model checkpoints are available at https://github.com/microsoft/unilm/tree/master/simlm.", + "authors": [ + "Liang Wang", + "Nan Yang", + "Xiaolong Huang", + "Binxing Jiao", + "Linjun Yang", + "Daxin Jiang", + "Rangan Majumder", + "Furu Wei" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.125", + "point2d": [ + 14.091294288635254, + -14.464376449584961 + ], + "cluster": 18.0 + }, + { + "idx": 127, + "title": "From Ultra-Fine to Fine: Fine-tuning Ultra-Fine Entity Typing Models to Fine-grained", + "abstract": "For the task of fine-grained entity typing (FET), due to the use of a large number of entity types, it is usually considered too costly to manually annotate a training dataset that contains an ample number of examples for each type. A common way to address this problem is to use distantly annotated training data that contains incorrect labels. However, the performance of models trained solely with such data can be limited by the errors in the automatic annotation. Recently, there are a few approaches that no longer follow this conventional way. But without sufficient direct entity typing supervision, they may also yield inferior performance. In this paper, we propose a new approach that can avoid the need to create distantly labeled data whenever there is a new type schema. We first train an entity typing model that has extremely broad type coverage by using the ultra-fine entity typing data. Then, when there is a need to produce a model for a newly designed fine-grained entity type schema, we can simply fine-tune the previously trained model with a small number of examples annotated under this schema. Experimental results show that our approach achieves outstanding performance for FET under the few-shot setting. It can also outperform state-of-the-art weak supervision based methods after fine-tuning the model with only a small manually annotated training set.", + "authors": [ + "Hongliang Dai", + "Ziqian Zeng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.126", + "point2d": [ + 40.09565353393555, + -83.0239028930664 + ], + "cluster": 14.0 + }, + { + "idx": 128, + "title": "Controlling Learned Effects to Reduce Spurious Correlations in Text Classifiers", + "abstract": "To address the problem of NLP classifiers learning spurious correlations between training features and target labels, a common approach is to make the model\u2019s predictions invariant to these features. However, this can be counter-productive when the features have a non-zero causal effect on the target label and thus are important for prediction. Therefore, using methods from the causal inference literature, we propose an algorithm to regularize the learnt effect of the features on the model\u2019s prediction to the estimated effect of feature on label. This results in an automated augmentation method that leverages the estimated effect of a feature to appropriately change the labels for new augmented inputs. 
On toxicity and IMDB review datasets, the proposed algorithm minimises spurious correlations and improves the minority group (i.e., samples breaking spurious correlations) accuracy, while also improving the total accuracy compared to standard training.", + "authors": [ + "Parikshit Bansal", + "Amit Sharma" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.127", + "point2d": [ + 28.414255142211914, + -0.7491106390953064 + ], + "cluster": 17.0 + }, + { + "idx": 129, + "title": "What Makes Pre-trained Language Models Better Zero-shot Learners?", + "abstract": "Current methods for prompt learning in zero-shot scenarios widely rely on a development set with sufficient human-annotated data to select the best-performing prompt template a posteriori. This is not ideal because in a real-world zero-shot scenario of practical relevance, no labelled data is available. Thus, we propose a simple yet effective method for screening reasonable prompt templates in zero-shot text classification: Perplexity Selection (Perplection). We hypothesize that language discrepancy can be used to measure the efficacy of prompt templates, and thereby develop a substantiated perplexity-based scheme allowing for forecasting the performance of prompt templates in advance. Experiments show that our method leads to improved prediction performance in a realistic zero-shot setting, eliminating the need for any labelled examples.", + "authors": [ + "Jinghui Lu", + "Dongsheng Zhu", + "Weidong Han", + "Rui Zhao", + "Brian Mac Namee", + "Fei Tan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.128", + "point2d": [ + -15.617977142333984, + -9.632047653198242 + ], + "cluster": 3.0 + }, + { + "idx": 130, + "title": "Z-ICL: Zero-Shot In-Context Learning with Pseudo-Demonstrations", + "abstract": "Although large language models can be prompted for both zero- and few-shot learning, performance drops significantly when no demonstrations are available. In this paper, we introduce Z-ICL, a new zero-shot method that closes the gap by constructing pseudo-demonstrations for a given test input using a raw text corpus. Concretely, pseudo-demonstrations are constructed by (1) finding the nearest neighbors to the test input from the corpus and pairing them with random task labels, and (2) applying a set of techniques to reduce the amount of direct copying the model does from the resulting demonstrations. Evaluation on nine classification datasets shows that Z-ICL outperforms previous zero-shot methods by a significant margin, and is on par with in-context learning with labeled training data in the few-shot setting. Overall, Z-ICL provides a significantly higher estimate of the zero-shot performance levels of a model, and supports future efforts to develop better pseudo-demonstrations that further improve zero-shot results.", + "authors": [ + "Xinxi Lyu", + "Sewon Min", + "Iz Beltagy", + "Luke Zettlemoyer", + "Hannaneh Hajishirzi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.129", + "point2d": [ + -12.82280158996582, + -16.263553619384766 + ], + "cluster": 3.0 + }, + { + "idx": 131, + "title": "Learning Optimal Policy for Simultaneous Machine Translation via Binary Search", + "abstract": "Simultaneous machine translation (SiMT) starts to output translation while reading the source sentence and needs a precise policy to decide when to output the generated translation. 
Therefore, the policy determines the number of source tokens read during the translation of each target token. However, it is difficult to learn a precise translation policy to achieve good latency-quality trade-offs, because there is no golden policy corresponding to parallel sentences as explicit supervision. In this paper, we present a new method for constructing the optimal policy online via binary search. By employing explicit supervision, our approach enables the SiMT model to learn the optimal policy, which can guide the model in completing the translation during inference. Experiments on four translation tasks show that our method can exceed strong baselines across all latency scenarios.", + "authors": [ + "Shoutao Guo", + "Shaolei Zhang", + "Yang Feng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.130", + "point2d": [ + -62.13486099243164, + -5.4658942222595215 + ], + "cluster": 21.0 + }, + { + "idx": 132, + "title": "Better Simultaneous Translation with Monotonic Knowledge Distillation", + "abstract": "Simultaneous machine translation (SiMT) presents a unique challenge as it requires generating target tokens before the source sentence is fully consumed. This can lead to the hallucination problem, where target tokens are generated without support from the source sentence. The prefix-to-prefix training data used to train SiMT models are not always parallel, due to divergent word order between the source and target languages, and can contribute to the problem. In this paper, we propose a novel approach that leverages traditional translation models as teachers and employs a two-stage beam search algorithm to generate monotonic yet accurate reference translations for sequence-level knowledge distillation. Experimental results demonstrate the significant improvements achieved by our approach over multiple strong SiMT baselines, leading to new state-of-the-art performance across various language pairs. Notably, when evaluated on a monotonic version of the WMT15 De-En test set, which includes references generated in a more monotonic style by professional translators, our approach achieves even more substantial improvement over the baselines. The source code and data are publicly available for further exploration.", + "authors": [ + "Shushu Wang", + "Jing Wu", + "Kai Fan", + "Wei Luo", + "Jun Xiao", + "Zhongqiang Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.131", + "point2d": [ + -64.0119857788086, + -5.3633222579956055 + ], + "cluster": 21.0 + }, + { + "idx": 133, + "title": "StoryARG: a corpus of narratives and personal experiences in argumentative texts", + "abstract": "Humans are storytellers, even in communication scenarios which are assumed to be more rationality-oriented, such as argumentation. Indeed, supporting arguments with narratives or personal experiences (henceforth, stories) is a very natural thing to do \u2013 and yet, this phenomenon is largely unexplored in computational argumentation. Which role do stories play in an argument? Do they make the argument more effective? What are their narrative properties? To address these questions, we collected and annotated StoryARG, a dataset sampled from well-established corpora in computational argumentation (ChangeMyView and RegulationRoom), and the Social Sciences (Europolis), as well as comments to New York Times articles. StoryARG contains 2451 textual spans annotated at two levels. 
At the argumentative level, we annotate the function of the story (e.g., clarification, disclosure of harm, search for a solution, establishing speaker\u2019s authority), as well as its impact on the effectiveness of the argument and its emotional load. At the level of narrative properties, we annotate whether the story has a plot-like development, is factual or hypothetical, and who the protagonist is. What makes a story effective in an argument? Our analysis of the annotations in StoryARG uncovers a positive impact on effectiveness for stories which illustrate a solution to a problem, and in general, annotator-specific preferences that we investigate with regression analysis.", + "authors": [ + "Neele Falk", + "Gabriella Lapesa" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.132", + "point2d": [ + 39.73584747314453, + 36.72584533691406 + ], + "cluster": 19.0 + }, + { + "idx": 134, + "title": "Injecting knowledge into language generation: a case study in auto-charting after-visit care instructions from medical dialogue", + "abstract": "Factual correctness is often the limiting factor in practical applications of natural language generation in high-stakes domains such as healthcare. An essential requirement for maintaining factuality is the ability to deal with rare tokens. This paper focuses on rare tokens that appear in both the source and the reference sequences, and which, when missed during generation, decrease the factual correctness of the output text. For high-stake domains that are also knowledge-rich, we show how to use knowledge to (a) identify which rare tokens that appear in both source and reference are important and (b) uplift their conditional probability. We introduce the \u201cutilization rate\u201d that encodes knowledge and serves as a regularizer by maximizing the marginal probability of selected tokens. We present a study in a knowledge-rich domain of healthcare, where we tackle the problem of generating after-visit care instructions based on patient-doctor dialogues. We verify that, in our dataset, specific medical concepts with high utilization rates are underestimated by conventionally trained sequence-to-sequence models. We observe that correcting this with our approach to knowledge injection reduces the uncertainty of the model as well as improves factuality and coherence without negatively impacting fluency.", + "authors": [ + "Maksim Eremeev", + "Ilya Valmianski", + "Xavier Amatriain", + "Anitha Kannan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.133", + "point2d": [ + 19.332918167114258, + -0.5225687623023987 + ], + "cluster": 4.0 + }, + { + "idx": 135, + "title": "Sequence Parallelism: Long Sequence Training from System Perspective", + "abstract": "Transformer achieves promising results on various tasks. However, self-attention suffers from quadratic memory requirements with respect to the sequence length. Existing work focuses on reducing time and space complexity from an algorithm perspective. In this work, we propose sequence parallelism, a memory-efficient parallelism to solve this issue from system perspective instead. Our approach is compatible with most existing parallelisms (e.g., data, pipeline, and tensor parallelism), which means our sequence parallelism makes 4D parallelism possible. More importantly, we no longer require a single device to hold the whole sequence. 
Besides, using efficient attention with linear complexity, our sequence parallelism enables us to train transformers with infinitely long sequences. Specifically, we split the input sequence into multiple chunks and feed each chunk into its corresponding device (i.e., GPU). To compute the attention output, we integrated ring-style communication with self-attention calculation and proposed Ring Self-Attention (RSA). Experiments show that sequence parallelism performs well when scaling with batch size and sequence length. Compared with tensor parallelism, our approach achieved 13.7\\times and 3.0\\times maximum batch size and sequence length respectively when scaling up to 64 NVIDIA P100 GPUs. With efficient attention, sequence parallelism can handle sequences with over 114K tokens, which is over 27\\times longer than existing efficient attention works holding the whole sequence on a single device.", + "authors": [ + "Shenggui Li", + "Fuzhao Xue", + "Chaitanya Baranwal", + "Yongbin Li", + "Yang You" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.134", + "point2d": [ + -35.92945861816406, + -28.77885627746582 + ], + "cluster": 27.0 + }, + { + "idx": 136, + "title": "MUSTIE: Multimodal Structural Transformer for Web Information Extraction", + "abstract": "The task of web information extraction is to extract target fields of an object from web pages, such as extracting the name, genre and actor from a movie page. Recent sequential modeling approaches have achieved state-of-the-art results on web information extraction. However, most of these methods only focus on extracting information from textual sources while ignoring the rich information from other modalities such as image and web layout. In this work, we propose a novel MUltimodal Structural Transformer (MUST) that incorporates multiple modalities for web information extraction. Concretely, we develop a structural encoder that jointly encodes the multimodal information based on the HTML structure of the web layout, where high-level DOM nodes, and low-level text and image tokens are introduced to represent the entire page. Structural attention patterns are designed to learn effective cross-modal embeddings for all DOM nodes and low-level tokens. An extensive set of experiments are conducted on WebSRC and Common Crawl benchmarks. Experimental results demonstrate the superior performance of MUST over several state-of-the-art baselines.", + "authors": [ + "Qifan Wang", + "Jingang Wang", + "Xiaojun Quan", + "Fuli Feng", + "Zenglin Xu", + "Shaoliang Nie", + "Sinong Wang", + "Madian Khabsa", + "Hamed Firooz", + "Dongfang Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.135", + "point2d": [ + 29.142953872680664, + -57.47809982299805 + ], + "cluster": 13.0 + }, + { + "idx": 137, + "title": "Augmentation-Adapted Retriever Improves Generalization of Language Models as Generic Plug-In", + "abstract": "Retrieval augmentation can aid language models (LMs) in knowledge-intensive tasks by supplying them with external information. Prior works on retrieval augmentation usually jointly fine-tune the retriever and the LM, making them closely coupled. In this paper, we explore the scheme of generic retrieval plug-in: the retriever is to assist target LMs that may not be known beforehand or are unable to be fine-tuned together. 
To retrieve useful documents for unseen target LMs, we propose augmentation-adapted retriever (AAR), which learns LM\u2019s preferences obtained from a known source LM. Experiments on the MMLU and PopQA datasets demonstrate that our AAR trained with a small source LM is able to significantly improve the zero-shot generalization of larger target LMs ranging from 250M Flan-T5 to 175B InstructGPT. Further analysis indicates that the preferences of different LMs overlap, enabling AAR trained with a single source LM to serve as a generic plug-in for various target LMs. Our code is open-sourced at https://github.com/OpenMatch/Augmentation-Adapted-Retriever.", + "authors": [ + "Zichun Yu", + "Chenyan Xiong", + "Shi Yu", + "Zhiyuan Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.136", + "point2d": [ + 9.216465950012207, + -17.11669921875 + ], + "cluster": 20.0 + }, + { + "idx": 138, + "title": "TableVLM: Multi-modal Pre-training for Table Structure Recognition", + "abstract": "Tables are widely used in research and business, which are suitable for human consumption, but not easily machine-processable, particularly when tables are present in images. One of the main challenges to extracting data from images of tables is accurately recognizing table structures, especially for complex tables with cross rows and columns. In this study, we propose a novel multi-modal pre-training model for table structure recognition, named TableVLM. With a two-stream multi-modal transformer-based encoder-decoder architecture, TableVLM learns to capture rich table structure-related features by multiple carefully-designed unsupervised objectives inspired by the notion of masked visual-language modeling. To pre-train this model, we also created a dataset, called ComplexTable, which consists of 1,000K samples to be released publicly. Experiment results show that the model built on pre-trained TableVLM can improve the performance up to 1.97% in tree-editing-distance-score on ComplexTable.", + "authors": [ + "Leiyuan Chen", + "Chengsong Huang", + "Xiaoqing Zheng", + "Jinshu Lin", + "Xuanjing Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.137", + "point2d": [ + 79.76891326904297, + 6.890159606933594 + ], + "cluster": 43.0 + }, + { + "idx": 139, + "title": "Can NLI Provide Proper Indirect Supervision for Low-resource Biomedical Relation Extraction?", + "abstract": "Two key obstacles in biomedical relation extraction (RE) are the scarcity of annotations and the prevalence of instances without explicitly pre-defined labels due to low annotation coverage. Existing approaches, which treat biomedical RE as a multi-class classification task, often result in poor generalization in low-resource settings and do not have the ability to make selective prediction on unknown cases but give a guess from seen relations, hindering the applicability of those approaches. We present NBR, which converts biomedical RE as natural language inference formulation through indirect supervision. By converting relations to natural language hypotheses, NBR is capable of exploiting semantic cues to alleviate annotation scarcity. By incorporating a ranking-based loss that implicitly calibrates abstinent instances, NBR learns a clearer decision boundary and is instructed to abstain on uncertain instances. 
Extensive experiments on three widely-used biomedical RE benchmarks, namely ChemProt, DDI and GAD, verify the effectiveness of NBR in both full-set and low-resource regimes. Our analysis demonstrates that indirect supervision benefits biomedical RE even when a domain gap exists, and combining NLI knowledge with biomedical knowledge leads to the best performance gains.", + "authors": [ + "Jiashu Xu", + "Mingyu Derek Ma", + "Muhao Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.138", + "point2d": [ + 35.42085266113281, + -43.55021667480469 + ], + "cluster": 42.0 + }, + { + "idx": 140, + "title": "Dynamic Routing Transformer Network for Multimodal Sarcasm Detection", + "abstract": "Multimodal sarcasm detection is an important research topic in natural language processing and multimedia computing, and benefits a wide range of applications in multiple domains. Most existing studies regard the incongruity between image and text as the indicative clue in identifying multimodal sarcasm. To capture cross-modal incongruity, previous methods rely on fixed architectures in network design, which restricts the model from dynamically adjusting to diverse image-text pairs. Inspired by routing-based dynamic network, we model the dynamic mechanism in multimodal sarcasm detection and propose the Dynamic Routing Transformer Network (DynRT-Net). Our method utilizes dynamic paths to activate different routing transformer modules with hierarchical co-attention adapting to cross-modal incongruity. Experimental results on a public dataset demonstrate the effectiveness of our method compared to the state-of-the-art methods. Our codes are available at https://github.com/TIAN-viola/DynRT.", + "authors": [ + "Yuan Tian", + "Nan Xu", + "Ruike Zhang", + "Wenji Mao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.139", + "point2d": [ + -24.868488311767578, + 61.95337677001953 + ], + "cluster": 34.0 + }, + { + "idx": 141, + "title": "What Are You Token About? Dense Retrieval as Distributions Over the Vocabulary", + "abstract": "Dual encoders are now the dominant architecture for dense retrieval. Yet, we have little understanding of how they represent text, and why this leads to good performance. In this work, we shed light on this question via distributions over the vocabulary. We propose to interpret the vector representations produced by dual encoders by projecting them into the model\u2019s vocabulary space. We show that the resulting projections contain rich semantic information, and draw connection between them and sparse retrieval. We find that this view can offer an explanation for some of the failure cases of dense retrievers. For example, we observe that the inability of models to handle tail entities is correlated with a tendency of the token distributions to forget some of the tokens of those entities. 
We leverage this insight and propose a simple way to enrich query and passage representations with lexical information at inference time, and show that this significantly improves performance compared to the original model in zero-shot settings, and specifically on the BEIR benchmark.", + "authors": [ + "Ori Ram", + "Liat Bezalel", + "Adi Zicher", + "Yonatan Belinkov", + "Jonathan Berant", + "Amir Globerson" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.140", + "point2d": [ + 12.78380012512207, + -13.286795616149902 + ], + "cluster": 18.0 + }, + { + "idx": 142, + "title": "Cold-Start Data Selection for Better Few-shot Language Model Fine-tuning: A Prompt-based Uncertainty Propagation Approach", + "abstract": "We present PATRON, a prompt-based data selection method for pre-trained language model fine-tuning under cold-start scenarios, i.e., no initial labeled data are available. In PATRON, we design (1) a prompt-based uncertainty propagation approach to estimate the importance of data points and (2) a partition-then-rewrite (PTR) strategy to promote sample diversity when querying for annotations. Experiments on six text classification datasets show that PATRON outperforms the strongest cold-start data selection baselines by up to 6.9%. Besides, with 128 labels only, PATRON achieves 91.0% and 92.1% of the fully supervised performance based on vanilla fine-tuning and prompt-based learning respectively. Our implementation of PATRON will be published upon acceptance.", + "authors": [ + "Yue Yu", + "Rongzhi Zhang", + "Ran Xu", + "Jieyu Zhang", + "Jiaming Shen", + "Chao Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.141", + "point2d": [ + -16.52063751220703, + -6.085596561431885 + ], + "cluster": 20.0 + }, + { + "idx": 143, + "title": "Training-free Neural Architecture Search for RNNs and Transformers", + "abstract": "Neural architecture search (NAS) has allowed for the automatic creation of new and effective neural network architectures, offering an alternative to the laborious process of manually designing complex architectures. However, traditional NAS algorithms are slow and require immense amounts of computing power. Recent research has investigated training-free NAS metrics for image classification architectures, drastically speeding up search algorithms. In this paper, we investigate training-free NAS metrics for recurrent neural network (RNN) and BERT-based transformer architectures, targeted towards language modeling tasks. First, we develop a new training-free metric, named hidden covariance, that predicts the trained performance of an RNN architecture and significantly outperforms existing training-free metrics. We experimentally evaluate the effectiveness of the hidden covariance metric on the NAS-Bench-NLP benchmark. Second, we find that the current search space paradigm for transformer architectures is not optimized for training-free neural architecture search. Instead, a simple qualitative analysis can effectively shrink the search space to the best performing architectures. This conclusion is based on our investigation of existing training-free metrics and new metrics developed from recent transformer pruning literature, evaluated on our own benchmark of trained BERT architectures. Ultimately, our analysis shows that the architecture search space and the training-free metric must be developed together in order to achieve effective results. 
Our source code is available at https://github.com/aaronserianni/training-free-nas.", + "authors": [ + "Aaron Serianni", + "Jugal Kalita" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.142", + "point2d": [ + -41.91786575317383, + -25.490467071533203 + ], + "cluster": 6.0 + }, + { + "idx": 144, + "title": "CrossSum: Beyond English-Centric Cross-Lingual Summarization for 1,500+ Language Pairs", + "abstract": "We present CrossSum, a large-scale cross-lingual summarization dataset comprising 1.68 million article-summary samples in 1,500+ language pairs. We create CrossSum by aligning parallel articles written in different languages via cross-lingual retrieval from a multilingual abstractive summarization dataset and perform a controlled human evaluation to validate its quality. We propose a multistage data sampling algorithm to effectively train a cross-lingual summarization model capable of summarizing an article in any target language. We also introduce LaSE, an embedding-based metric for automatically evaluating model-generated summaries. LaSE is strongly correlated with ROUGE and, unlike ROUGE, can be reliably measured even in the absence of references in the target language. Performance on ROUGE and LaSE indicate that our proposed model consistently outperforms baseline models. To the best of our knowledge, CrossSum is the largest cross-lingual summarization dataset and the first ever that is not centered around English. We are releasing the dataset, training and evaluation scripts, and models to spur future research on cross-lingual summarization. The resources can be found at https://github.com/csebuetnlp/CrossSum", + "authors": [ + "Abhik Bhattacharjee", + "Tahmid Hasan", + "Wasi Uddin Ahmad", + "Yuan-Fang Li", + "Yong-Bin Kang", + "Rifat Shahriyar" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.143", + "point2d": [ + -11.505590438842773, + 39.7391242980957 + ], + "cluster": 7.0 + }, + { + "idx": 145, + "title": "Improving Gradient Trade-offs between Tasks in Multi-task Text Classification", + "abstract": "Multi-task learning (MTL) has emerged as a promising approach for sharing inductive bias across multiple tasks to enable more efficient learning in text classification. However, training all tasks simultaneously often yields degraded performance on each task compared to learning them independently, since different tasks might conflict with each other. Existing MTL methods for alleviating this issue leverage heuristics or gradient-based algorithms to achieve an arbitrary Pareto optimal trade-off among different tasks. 
In this paper, we present a novel gradient trade-off approach to mitigate the task conflict problem, dubbed GetMTL, which can achieve a specific trade-off among different tasks near the main objective of multi-task text classification (MTC), so as to improve the performance of each task simultaneously. The results of extensive experiments on two benchmark datasets back up our theoretical analysis and validate the superiority of our proposed GetMTL.", + "authors": [ + "Heyan Chai", + "Jinhao Cui", + "Ye Wang", + "Min Zhang", + "Binxing Fang", + "Qing Liao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.144", + "point2d": [ + -6.576905727386475, + -21.551105499267578 + ], + "cluster": 17.0 + }, + { + "idx": 146, + "title": "Bi-Phone: Modeling Inter Language Phonetic Influences in Text", + "abstract": "A large number of people are forced to use the Web in a language they have low literacy in due to technology asymmetries. Written text in the second language (L2) from such users often contains a large number of errors that are influenced by their native language (L1). We propose a method to mine phoneme confusions (sounds in L2 that an L1 speaker is likely to conflate) for pairs of L1 and L2. These confusions are then plugged into a generative model (Bi-Phone) for synthetically producing corrupted L2 text. Through human evaluations, we show that Bi-Phone generates plausible corruptions that differ across L1s and also have widespread coverage on the Web. We also corrupt the popular language understanding benchmark SuperGLUE with our technique (FunGLUE for Phonetically Noised GLUE) and show that SoTA language understanding models perform poorly. We also introduce a new phoneme prediction pre-training task which helps byte models to recover performance close to SuperGLUE. Finally, we also release the FunGLUE benchmark to promote further research in phonetically robust language models. To the best of our knowledge, FunGLUE is the first benchmark to introduce L1-L2 interactions in text.", + "authors": [ + "Abhirut Gupta", + "Ananya B. Sai", + "Richard Sproat", + "Yuri Vasilevski", + "James Ren", + "Ambarish Jash", + "Sukhdeep Sodhi", + "Aravindan Raghuveer" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.145", + "point2d": [ + -40.90016174316406, + 4.634227752685547 + ], + "cluster": 30.0 + }, + { + "idx": 147, + "title": "Cross2StrA: Unpaired Cross-lingual Image Captioning with Cross-lingual Cross-modal Structure-pivoted Alignment", + "abstract": "Unpaired cross-lingual image captioning has long suffered from irrelevancy and disfluency issues, due to the inconsistencies of the semantic scene and syntax attributes during transfer. In this work, we propose to address the above problems by incorporating the scene graph (SG) structures and the syntactic constituency (SC) trees. Our captioner contains the semantic structure-guided image-to-pivot captioning and the syntactic structure-guided pivot-to-target translation, two of which are joined via pivot language. We then take the SG and SC structures as pivoting, performing cross-modal semantic structure alignment and cross-lingual syntactic structure alignment learning. We further introduce cross-lingual&cross-modal back-translation training to fully align the captioning and translation stages. 
Experiments on English-Chinese transfers demonstrate that our model greatly improves captioning relevancy and fluency.", + "authors": [ + "Shengqiong Wu", + "Hao Fei", + "Wei Ji", + "Tat-Seng Chua" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.146", + "point2d": [ + -65.44623565673828, + 40.49787902832031 + ], + "cluster": 26.0 + }, + { + "idx": 148, + "title": "Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models", + "abstract": "Large language models (LLMs) have recently been shown to deliver impressive performance in various NLP tasks. To tackle multi-step reasoning tasks, few-shot chain-of-thought (CoT) prompting includes a few manually crafted step-by-step reasoning demonstrations which enable LLMs to explicitly generate reasoning steps and improve their reasoning task accuracy. To eliminate the manual efforts, Zero-shot-CoT concatenates the target problem statement with \u201cLet\u2019s think step by step\u201d as an input prompt to LLMs. Despite the success of Zero-shot-CoT, it still suffers from three pitfalls: calculation errors, missing-step errors, and semantic misunderstanding errors. To address the missing-step errors, we propose Plan-and-Solve (PS) Prompting. It consists of two components: first, devising a plan to divide the entire task into smaller subtasks, and then carrying out the subtasks according to the plan. To address the calculation errors and improve the quality of generated reasoning steps, we extend PS prompting with more detailed instructions and derive PS+ prompting. We evaluate our proposed prompting strategy on ten datasets across three reasoning problems. The experimental results over GPT-3 show that our proposed zero-shot prompting consistently outperforms Zero-shot-CoT across all datasets by a large margin, is comparable to or exceeds Zero-shot-Program-of-Thought Prompting, and has comparable performance with 8-shot CoT prompting on the math reasoning problem. The code can be found at https://github.com/AGI-Edgerunners/Plan-and-Solve-Prompting.", + "authors": [ + "Lei Wang", + "Wanyu Xu", + "Yihuai Lan", + "Zhiqiang Hu", + "Yunshi Lan", + "Roy Ka-Wei Lee", + "Ee-Peng Lim" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.147", + "point2d": [ + 46.001888275146484, + -15.037023544311523 + ], + "cluster": 36.0 + }, + { + "idx": 149, + "title": "RetroMAE-2: Duplex Masked Auto-Encoder For Pre-Training Retrieval-Oriented Language Models", + "abstract": "To better support information retrieval tasks such as web search and open-domain question answering, growing effort is made to develop retrieval-oriented language models, e.g., RetroMAE and many others. Most of the existing works focus on improving the semantic representation capability for the contextualized embedding of the [CLS] token. However, recent studies show that the ordinary tokens besides [CLS] may provide extra information, which helps to produce a better representation effect. As such, it\u2019s necessary to extend the current methods where all contextualized embeddings can be jointly pre-trained for the retrieval tasks. In this work, we propose a novel pre-training method called Duplex Masked Auto-Encoder, a.k.a. DupMAE. It is designed to improve the quality of semantic representation where all contextualized embeddings of the pre-trained model can be leveraged.
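As a rough illustration of what leveraging all contextualized embeddings can mean in a retrieval setting, the sketch below combines a dense [CLS] vector with a bag-of-words style vector pooled from the ordinary token embeddings; the shapes, pooling choice, and names are our own assumptions, not the DupMAE training objective itself (which is described next).

```python
import torch
import torch.nn.functional as F

def dual_representation(cls_emb, token_embs, vocab_proj):
    """Concatenate a dense [CLS] vector with a sparse-style vector
    obtained by projecting token embeddings into vocabulary space and
    max-pooling over positions."""
    bow = vocab_proj(token_embs).max(dim=1).values   # [batch, vocab]
    bow = F.normalize(bow, dim=-1)
    dense = F.normalize(cls_emb, dim=-1)             # [batch, hidden]
    return torch.cat([dense, bow], dim=-1)

batch, seq, hidden, vocab = 2, 8, 16, 100            # toy sizes
vocab_proj = torch.nn.Linear(hidden, vocab)
q = dual_representation(torch.randn(batch, hidden), torch.randn(batch, seq, hidden), vocab_proj)
d = dual_representation(torch.randn(batch, hidden), torch.randn(batch, seq, hidden), vocab_proj)
print(q @ d.T)  # inner-product relevance scores between queries and documents
```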
It takes advantage of two complementary auto-encoding tasks: one reconstructs the input sentence on top of the [CLS] embedding; the other one predicts the bag-of-words feature of the input sentence based on the ordinary tokens\u2019 embeddings. The two tasks are jointly conducted to train a unified encoder, where the whole contextualized embeddings are aggregated in a compact way to produce the final semantic representation. DupMAE is simple but empirically competitive: it substantially improves the pre-trained model\u2019s representation capability and transferability, where superior retrieval performances can be achieved on popular benchmarks, like MS MARCO and BEIR. We make our code publicly available at https://github.com/staoxiao/RetroMAE.", + "authors": [ + "Zheng Liu", + "Shitao Xiao", + "Yingxia Shao", + "Zhao Cao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.148", + "point2d": [ + 10.666644096374512, + -14.62844467163086 + ], + "cluster": 20.0 + }, + { + "idx": 150, + "title": "DecompX: Explaining Transformers Decisions by Propagating Token Decomposition", + "abstract": "An emerging solution for explaining Transformer-based models is to use vector-based analysis on how the representations are formed. However, providing a faithful vector-based explanation for a multi-layer model could be challenging in three aspects: (1) Incorporating all components into the analysis, (2) Aggregating the layer dynamics to determine the information flow and mixture throughout the entire model, and (3) Identifying the connection between the vector-based analysis and the model\u2019s predictions. In this paper, we present DecompX to tackle these challenges. DecompX is based on the construction of decomposed token representations and their successive propagation throughout the model without mixing them in between layers. Additionally, our proposal provides multiple advantages over existing solutions for its inclusion of all encoder components (especially nonlinear feed-forward networks) and the classification head. The former allows acquiring precise vectors while the latter transforms the decomposition into meaningful prediction-based values, eliminating the need for norm- or summation-based vector aggregation. According to the standard faithfulness evaluations, DecompX consistently outperforms existing gradient-based and vector-based approaches on various datasets.Our code is available at https://github.com/mohsenfayyaz/DecompX.", + "authors": [ + "Ali Modarressi", + "Mohsen Fayyaz", + "Ehsan Aghazadeh", + "Yadollah Yaghoobzadeh", + "Mohammad Taher Pilehvar" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.149", + "point2d": [ + -43.35667037963867, + -31.728940963745117 + ], + "cluster": 27.0 + }, + { + "idx": 151, + "title": "Symbolic Chain-of-Thought Distillation: Small Models Can Also \u201cThink\u201d Step-by-Step", + "abstract": "Chain-of-thought prompting (e.g., \u201cLet\u2019s think step-by-step\u201d) primes large language models to verbalize rationalization for their predictions. While chain-of-thought can lead to dramatic performance gains, benefits appear to emerge only for sufficiently large models (beyond 50B parameters). We show that orders-of-magnitude smaller models (125M\u20141.3B parameters) can still benefit from chain-of-thought prompting.
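For readers unfamiliar with the prompting setup, here is a minimal sketch of zero-shot chain-of-thought prompting and of sampling several rationales per instance from a teacher model, as one might do when building a distillation corpus; `generate` is a stand-in for whatever LM completion API is available, not a real library call.

```python
def cot_prompt(question: str) -> str:
    # Zero-shot CoT: the trigger phrase makes the model verbalize
    # intermediate reasoning before its final answer.
    return f"Q: {question}\nA: Let's think step by step."

def sample_rationales(generate, question: str, n: int = 5, temperature: float = 0.9):
    """Collect n diverse reasoning chains for one instance; such
    (question, rationale) pairs can later supervise a student model."""
    return [generate(cot_prompt(question), temperature=temperature) for _ in range(n)]

if __name__ == "__main__":
    dummy = lambda prompt, temperature: prompt + " <rationale sampled here>"
    print(sample_rationales(dummy, "What is 17 + 25?", n=2))
```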
To achieve this, we introduce Symbolic Chain-of-Thought Distillation (SCoTD), a method to train a smaller student model on rationalizations sampled from a significantly larger teacher model. Experiments across several commonsense benchmarks show that: 1) SCoTD enhances the performance of the student model in both supervised and few-shot settings, and especially for challenge sets; 2) sampling many reasoning chains per instance from the teacher is paramount; and 3) after distillation, student chain-of-thoughts are judged by humans as comparable to the teacher, despite orders of magnitude fewer parameters. We test several hypotheses regarding what properties of chain-of-thought samples are important, e.g., diversity vs. teacher likelihood vs. open-endedness. We release our corpus of chain-of-thought samples and code.", + "authors": [ + "Liunian Harold Li", + "Jack Hessel", + "Youngjae Yu", + "Xiang Ren", + "Kai-Wei Chang", + "Yejin Choi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.150", + "point2d": [ + 38.76264953613281, + -11.67507553100586 + ], + "cluster": 36.0 + }, + { + "idx": 152, + "title": "Generating EDU Extracts for Plan-Guided Summary Re-Ranking", + "abstract": "Two-step approaches, in which summary candidates are generated-then-reranked to return a single summary, can improve ROUGE scores over the standard single-step approach. Yet, standard decoding methods (i.e., beam search, nucleus sampling, and diverse beam search) produce candidates with redundant, and often low quality, content. In this paper, we design a novel method to generate candidates for re-ranking that addresses these issues. We ground each candidate abstract on its own unique content plan and generate distinct plan-guided abstracts using a model\u2019s top beam. More concretely, a standard language model (a BART LM) auto-regressively generates elemental discourse unit (EDU) content plans with an extractive copy mechanism. The top K beams from the content plan generator are then used to guide a separate LM, which produces a single abstractive candidate for each distinct plan. We apply an existing re-ranker (BRIO) to abstractive candidates generated from our method, as well as baseline decoding methods. We show large relevance improvements over previously published methods on widely used single document news article corpora, with ROUGE-2 F1 gains of 0.88, 2.01, and 0.38 on CNN / Dailymail, NYT, and Xsum, respectively. A human evaluation on CNN / DM validates these results. Similarly, on 1k samples from CNN / DM, we show that prompting GPT-3 to follow EDU plans outperforms sampling-based methods by 1.05 ROUGE-2 F1 points. Code to generate and realize plans is available at https://github.com/griff4692/edu-sum.", + "authors": [ + "Griffin Adams", + "Alex Fabbri", + "Faisal Ladhak", + "No\u00e9mie Elhadad", + "Kathleen McKeown" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.151", + "point2d": [ + -6.540194511413574, + 39.85152816772461 + ], + "cluster": 7.0 + }, + { + "idx": 153, + "title": "A Survey on Asking Clarification Questions Datasets in Conversational Systems", + "abstract": "The ability to understand a user\u2019s underlying needs is critical for conversational systems, especially with limited input from users in a conversation. Thus, in such a domain, Asking Clarification Questions (ACQs) to reveal users\u2019 true intent from their queries or utterances arises as an essential task.
However, a key limitation of existing ACQs studies is their incomparability, stemming from inconsistent use of data, distinct experimental setups, and evaluation strategies. Therefore, in this paper, to assist the development of ACQs techniques, we comprehensively analyse the current state of ACQs research, offering a detailed comparison of publicly available datasets and discussing the applied evaluation metrics, together with benchmarks for multiple ACQs-related tasks. In particular, given a thorough analysis of the ACQs task, we discuss a number of corresponding research directions for the investigation of ACQs as well as the development of conversational systems.", + "authors": [ + "Hossein A. Rahmani", + "Xi Wang", + "Yue Feng", + "Qiang Zhang", + "Emine Yilmaz", + "Aldo Lipani" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.152", + "point2d": [ + 15.50158977508545, + 52.33565139770508 + ], + "cluster": 24.0 + }, + { + "idx": 154, + "title": "Towards Understanding Chain-of-Thought Prompting: An Empirical Study of What Matters", + "abstract": "Chain-of-Thought (CoT) prompting can dramatically improve the multi-step reasoning abilities of large language models (LLMs). CoT explicitly encourages the LLM to generate intermediate rationales for solving a problem, by providing a series of reasoning steps in the demonstrations. Despite its success, there is still little understanding of what makes CoT prompting effective and which aspects of the demonstrated reasoning steps contribute to its performance. In this paper, we show that CoT reasoning is possible even with invalid demonstrations - prompting with invalid reasoning steps can achieve over 80-90% of the performance obtained using CoT under various metrics, while still generating coherent lines of reasoning during inference. Further experiments show that other aspects of the rationales, such as being relevant to the query and correctly ordering the reasoning steps, are much more important for effective CoT reasoning. Overall, these findings both deepen our understanding of CoT prompting, and open up new questions regarding LLMs\u2019 capability to learn to reason in context.", + "authors": [ + "Boshi Wang", + "Sewon Min", + "Xiang Deng", + "Jiaming Shen", + "You Wu", + "Luke Zettlemoyer", + "Huan Sun" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.153", + "point2d": [ + 46.57187271118164, + -11.21002197265625 + ], + "cluster": 36.0 + }, + { + "idx": 155, + "title": "Small Data, Big Impact: Leveraging Minimal Data for Effective Machine Translation", + "abstract": "For many languages, machine translation progress is hindered by the lack of reliable training data. Models are trained on whatever pre-existing datasets may be available and then augmented with synthetic data, because it is often not economical to pay for the creation of large-scale datasets. But for the case of low-resource languages, would the creation of a few thousand professionally translated sentence pairs give any benefit? In this paper, we show that it does. We describe a broad data collection effort involving around 6k professionally translated sentence pairs for each of 39 low-resource languages, which we make publicly available.
We analyse the gains of models trained on this small but high-quality data, showing that it has significant impact even when larger but lower quality pre-existing corpora are used, or when data is augmented with millions of sentences through backtranslation.", + "authors": [ + "Jean Maillard", + "Cynthia Gao", + "Elahe Kalbassi", + "Kaushik Ram Sadagopan", + "Vedanuj Goswami", + "Philipp Koehn", + "Angela Fan", + "Francisco Guzman" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.154", + "point2d": [ + -67.65005493164062, + -2.3086330890655518 + ], + "cluster": 1.0 + }, + { + "idx": 156, + "title": "RMLM: A Flexible Defense Framework for Proactively Mitigating Word-level Adversarial Attacks", + "abstract": "Adversarial attacks on deep neural networks keep raising security concerns in natural language processing research. Existing defenses focus on improving the robustness of the victim model in the training stage. However, they often neglect to proactively mitigate adversarial attacks during inference. Towards this overlooked aspect, we propose a defense framework that aims to mitigate attacks by confusing attackers and correcting adversarial contexts that are caused by malicious perturbations. Our framework comprises three components: (1) a synonym-based transformation to randomly corrupt adversarial contexts at the word level, (2) a developed BERT defender to correct abnormal contexts at the representation level, and (3) a simple detection method to filter out adversarial examples, any of which can be flexibly combined. Additionally, our framework helps improve the robustness of the victim model during training. Extensive experiments demonstrate the effectiveness of our framework in defending against word-level adversarial attacks.", + "authors": [ + "Zhaoyang Wang", + "Zhiyue Liu", + "Xiaopeng Zheng", + "Qinliang Su", + "Jiahai Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.155", + "point2d": [ + 4.996512413024902, + 7.901086330413818 + ], + "cluster": 48.0 + }, + { + "idx": 157, + "title": "Gradient-based Intra-attention Pruning on Pre-trained Language Models", + "abstract": "Pre-trained language models achieve superior performance but are computationally expensive. Techniques such as pruning and knowledge distillation have been developed to reduce their sizes and latencies. In this work, we propose a structured pruning method GRAIN (gradient-based intra-attention pruning), which performs task-specific pruning with knowledge distillation and yields highly effective models. Different from common approaches that prune each attention head as a whole, GRAIN inspects and prunes intra-attention structures, which greatly expands the structure search space and enables more flexible models. We also propose a gradient separation strategy that reduces the interference of distillation on pruning for a better combination of the two approaches. Experiments on GLUE, SQuAD, and CoNLL 2003 show that GRAIN notably outperforms other methods, especially in the high sparsity regime, and achieves 6-7x speedups while maintaining 93%-99% performance.
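The gradient-based scoring behind this family of methods can be sketched in a few lines. The snippet below uses a magnitude-times-gradient importance score (a common first-order Taylor criterion) with simple threshold masking; GRAIN's actual intra-attention scoring is finer-grained than this, so treat the code as a generic illustration with hypothetical names.

```python
import torch

def importance_scores(weight: torch.Tensor) -> torch.Tensor:
    """First-order Taylor importance: |w * dL/dw|, a common criterion
    in gradient-based pruning."""
    assert weight.grad is not None, "run backward() first"
    return (weight * weight.grad).abs()

def prune_by_sparsity(weight: torch.nn.Parameter, sparsity: float) -> None:
    """Zero out the fraction `sparsity` of entries with the lowest scores."""
    scores = importance_scores(weight)
    k = int(weight.numel() * sparsity)
    threshold = scores.flatten().kthvalue(k).values
    mask = (scores > threshold).float()
    weight.data.mul_(mask)

w = torch.nn.Parameter(torch.randn(8, 8))
loss = (w.sum() - 1).pow(2)
loss.backward()
prune_by_sparsity(w, sparsity=0.9)
print((w == 0).float().mean())  # roughly 0.9 of the entries are now zero
```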
Under extreme compression where only 3% of the transformer weights remain, the pruned model is still competitive compared to larger models.", + "authors": [ + "Ziqing Yang", + "Yiming Cui", + "Xin Yao", + "Shijin Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.156", + "point2d": [ + -39.263553619384766, + -21.693893432617188 + ], + "cluster": 8.0 + }, + { + "idx": 158, + "title": "Learning to Substitute Spans towards Improving Compositional Generalization", + "abstract": "Despite the rising prevalence of neural sequence models, recent empirical evidence suggests their deficiency in compositional generalization. One of the current de-facto solutions to this problem is compositional data augmentation, aiming to introduce additional compositional inductive bias. Nonetheless, the improvement offered by existing handcrafted augmentation strategies is limited when successful systematic generalization of neural sequence models requires multi-grained compositional bias (i.e., not limited to either lexical or structural biases only) or differentiation of training sequences in an imbalanced difficulty distribution. To address the two challenges, we first propose a novel compositional augmentation strategy dubbed Span Substitution (SpanSub) that enables multi-grained composition of substantial substructures in the whole training set. Over and above that, we introduce the Learning to Substitute Span (L2S2) framework which empowers the learning of span substitution probabilities in SpanSub in an end-to-end manner by maximizing the loss of neural sequence models, so as to outweigh those challenging compositions with elusive concepts and novel surroundings. Our empirical results on three standard compositional generalization benchmarks, including SCAN, COGS and GeoQuery (with an improvement of at most 66.5%, 10.3%, 1.2%, respectively), demonstrate the superiority of SpanSub, L2S2 and their combination.", + "authors": [ + "Zhaoyi Li", + "Ying Wei", + "Defu Lian" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.157", + "point2d": [ + -21.61821746826172, + -51.15730667114258 + ], + "cluster": 27.0 + }, + { + "idx": 159, + "title": "DiffusEmp: A Diffusion Model-Based Framework with Multi-Grained Control for Empathetic Response Generation", + "abstract": "Empathy is a crucial factor in open-domain conversations, which naturally shows one\u2019s care for and understanding of others. Though several methods have been proposed to generate empathetic responses, existing works often lead to monotonous empathy that refers to generic and safe expressions. In this paper, we propose to use explicit control to guide the empathy expression and design a framework DiffusEmp based on conditional diffusion language model to unify the utilization of dialogue context and attribute-oriented control signals. Specifically, communication mechanism, intent, and semantic frame are imported as multi-grained signals that control the empathy realization from coarse to fine levels. We then design a specific masking strategy to reflect the relationship between multi-grained signals and response tokens, and integrate it into the diffusion model to influence the generative process.
Experimental results on the benchmark dataset EmpatheticDialogue show that our framework outperforms competitive baselines in terms of controllability, informativeness, and diversity without loss of context-relatedness.", + "authors": [ + "Guanqun Bi", + "Lei Shen", + "Yanan Cao", + "Meng Chen", + "Yuqiang Xie", + "Zheng Lin", + "Xiaodong He" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.158", + "point2d": [ + 25.9416446685791, + 64.07491302490234 + ], + "cluster": 33.0 + }, + { + "idx": 160, + "title": "BREAK: Breaking the Dialogue State Tracking Barrier with Beam Search and Re-ranking", + "abstract": "Despite the recent advances in dialogue state tracking (DST), the joint goal accuracy (JGA) of the existing methods on MultiWOZ 2.1 still remains at merely 60%. In our preliminary error analysis, we find that beam search produces a pool of candidates that is likely to include the correct dialogue state. Motivated by this observation, we introduce a novel framework, called BREAK (Beam search and RE-rAnKing), that achieves outstanding performance on DST. BREAK performs DST in two stages: (i) generating k-best dialogue state candidates with beam search and (ii) re-ranking the candidates to select the correct dialogue state. This simple yet powerful framework shows state-of-the-art performance on all versions of MultiWOZ and M2M datasets. Most notably, we push the joint goal accuracy to 80-90% on MultiWOZ 2.1-2.4, which is an improvement of 23.6%, 26.3%, 21.7%, and 10.8% over the previous best-performing models, respectively. The data and code will be available at https://github.com/tony-won/DST-BREAK", + "authors": [ + "Seungpil Won", + "Heeyoung Kwak", + "Joongbo Shin", + "Janghoon Han", + "Kyomin Jung" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.159", + "point2d": [ + 3.04176664352417, + 72.86465454101562 + ], + "cluster": 49.0 + }, + { + "idx": 161, + "title": "Faithful Low-Resource Data-to-Text Generation through Cycle Training", + "abstract": "Methods to generate text from structured data have advanced significantly in recent years, primarily due to fine-tuning of pre-trained language models on large datasets. However, such models can fail to produce output faithful to the input data, particularly on out-of-domain data. Sufficient annotated data is often not available for specific domains, leading us to seek an unsupervised approach to improve the faithfulness of output text. Since the problem is fundamentally one of consistency between the representations of the structured data and text, we evaluate the effectiveness of cycle training in this work. Cycle training uses two models which are inverses of each other: one that generates text from structured data, and one which generates the structured data from natural language text. We show that cycle training, when initialized with a small amount of supervised data (100 samples in our case), achieves nearly the same performance as fully supervised approaches for the data-to-text generation task on the WebNLG, E2E, WTQ, and WSQL datasets.
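The cycle idea itself fits in a few lines: each model produces pseudo-targets for its inverse, so reconstruction consistency becomes the supervision signal. The sketch below uses our own function names as stand-ins for the two seq2seq models and an arbitrary training step, not the paper's code.

```python
def cycle_training_round(data_to_text, text_to_data, train_step,
                         unlabeled_data, unlabeled_text):
    """One round of cycle training between two mutually inverse models."""
    # data -> pseudo text, then teach text_to_data to recover the data
    pseudo_text = [data_to_text(d) for d in unlabeled_data]
    train_step(text_to_data, inputs=pseudo_text, targets=unlabeled_data)
    # text -> pseudo data, then teach data_to_text to recover the text
    pseudo_data = [text_to_data(t) for t in unlabeled_text]
    train_step(data_to_text, inputs=pseudo_data, targets=unlabeled_text)
```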
We perform extensive empirical analysis with automated evaluation metrics and a newly designed human evaluation schema to reveal different cycle training strategies\u2019 effectiveness in reducing various types of generation errors. Our code is publicly available at https://github.com/Edillower/CycleNLG.", + "authors": [ + "Zhuoer Wang", + "Marcus Collins", + "Nikhita Vedula", + "Simone Filice", + "Shervin Malmasi", + "Oleg Rokhlenko" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.160", + "point2d": [ + -6.939641952514648, + -46.89628219604492 + ], + "cluster": 4.0 + }, + { + "idx": 162, + "title": "Towards Stable Natural Language Understanding via Information Entropy Guided Debiasing", + "abstract": "Although achieving promising performance, current Natural Language Understanding models tend to utilize dataset biases instead of learning the intended task, which always leads to performance degradation on out-of-distribution (OOD) samples. To increase performance stability, previous debiasing methods empirically capture bias features from data to prevent the model from exploiting the corresponding biases. However, our analyses show that the empirical debiasing methods may fail to capture part of the potential dataset biases and mistake semantic information of input text as biases, which limits the effectiveness of debiasing. To address these issues, we propose a debiasing framework IEGDB that comprehensively detects the dataset biases to induce a set of biased features, and then purifies the biased features with the guidance of information entropy. Experimental results show that IEGDB can consistently improve the stability of performance on OOD datasets for a set of widely adopted NLU models.", + "authors": [ + "Li Du", + "Xiao Ding", + "Zhouhao Sun", + "Ting Liu", + "Bing Qin", + "Jingshuo Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.161", + "point2d": [ + 2.4625141620635986, + -8.459734916687012 + ], + "cluster": 17.0 + }, + { + "idx": 163, + "title": "Dynamic and Efficient Inference for Text Generation via BERT Family", + "abstract": "Despite the excellent performance of Pre-trained Language Models on many text generation tasks, they suffer from inefficient inference in terms of computation and memory due to their large-scale parameters and the universal autoregressive decoding paradigm. In this work, we propose a novel fine-tuning method DEER, which can make a single pre-trained model support Dynamic and Efficient infERence and achieve an adaptive trade-off between model performance and latency. In particular, our critical insight is to jointly utilize the non-autoregressive (NAR) generation and dynamic parameter pruning techniques, which can flexibly control the decoding iteration steps and model sizes according to memory and latency limitations. Besides, we also explore the effectiveness of the pre-trained MLMs (i.e., the BERT family) for text generation tasks since their bidirectional attention nature is more suitable for the NAR training objective. Extensive experiments on both monolingual and multilingual pre-trained MLMs demonstrate the effectiveness of our proposed DEER method by consistently achieving (1) higher BLEU scores than the strong autoregressive Transformer model on three neural machine translation tasks with 3-12 times speedup, (2) competitive performance (but with much faster inference speed) compared with the BART model on four GLGE benchmark tasks.
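To make the NAR decoding side concrete, here is a minimal sketch of mask-predict style iterative decoding, the standard non-autoregressive scheme that MLM-based generators build on; `fill_fn` stands in for a BERT-family masked LM, and the linear unmasking schedule is one common choice rather than DEER's exact procedure.

```python
def mask_predict(fill_fn, length, iterations=4, mask="[MASK]"):
    """Start fully masked, then repeatedly re-mask the least confident
    tokens and re-predict them in parallel. fill_fn maps a token list
    to (tokens, confidences)."""
    tokens = [mask] * length
    for t in range(iterations):
        tokens, conf = fill_fn(tokens)
        n_mask = int(length * (iterations - t - 1) / iterations)
        if n_mask == 0:
            break
        worst = sorted(range(length), key=lambda i: conf[i])[:n_mask]
        for i in worst:
            tokens[i] = mask
    return tokens

if __name__ == "__main__":
    import random
    vocab = ["a", "b", "c"]
    dummy = lambda toks: ([random.choice(vocab) for _ in toks],
                          [random.random() for _ in toks])
    print(mask_predict(dummy, length=6))
```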
Our code will be publicly available at GitHub: https://github.com/dropreg/DEER.", + "authors": [ + "Xiaobo Liang", + "Juntao Li", + "Lijun Wu", + "Ziqiang Cao", + "Min Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.162", + "point2d": [ + -30.07240104675293, + 9.909956932067871 + ], + "cluster": 21.0 + }, + { + "idx": 164, + "title": "Learning to Generate Equitable Text in Dialogue from Biased Training Data", + "abstract": "The ingrained principles of fairness in a dialogue system\u2019s decision-making process and generated responses are crucial for user engagement, satisfaction, and task achievement. Absence of equitable and inclusive principles can hinder the formation of common ground, which in turn negatively impacts the overall performance of the system. For example, misusing pronouns in a user interaction may cause ambiguity about the intended subject. Yet, there is no comprehensive study of equitable text generation in dialogue. Aptly, in this work, we use theories of computational learning to study this problem. We provide formal definitions of equity in text generation, and further, prove formal connections between learning human-likeness and learning equity: algorithms for improving equity ultimately reduce to algorithms for improving human-likeness (on augmented data). With this insight, we also formulate reasonable conditions under which text generation algorithms can learn to generate equitable text without any modifications to the biased training data on which they learn. To exemplify our theory in practice, we look at a group of algorithms for the GuessWhat?! visual dialogue game and, using this example, test our theory empirically. Our theory accurately predicts relative-performance of multiple algorithms in generating equitable text as measured by both human and automated evaluation.", + "authors": [ + "Anthony Sicilia", + "Malihe Alikhani" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.163", + "point2d": [ + 26.31662368774414, + 39.60507583618164 + ], + "cluster": 24.0 + }, + { + "idx": 165, + "title": "Hierarchical Verbalizer for Few-Shot Hierarchical Text Classification", + "abstract": "Due to the complex label hierarchy and intensive labeling cost in practice, hierarchical text classification (HTC) suffers from poor performance, especially when low-resource or few-shot settings are considered. Recently, there has been a growing trend of applying prompts to pre-trained language models (PLMs), which has exhibited effectiveness in few-shot flat text classification tasks. However, limited work has studied the paradigm of prompt-based learning in the HTC problem when the training data is extremely scarce. In this work, we define a path-based few-shot setting and establish a strict path-based evaluation metric to further explore few-shot HTC tasks. To address the issue, we propose the hierarchical verbalizer (\u201cHierVerb\u201d), a multi-verbalizer framework treating HTC as a single- or multi-label classification problem at multiple layers and learning vectors as verbalizers constrained by hierarchical structure and hierarchical contrastive learning. In this manner, HierVerb fuses label hierarchy knowledge into verbalizers and remarkably outperforms those that inject hierarchy through graph encoders, maximizing the benefits of PLMs.
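To illustrate the verbalizer idea in its simplest form, the sketch below scores each leaf label by summing verbalizer-token logits along its root-to-leaf path; HierVerb's learned soft verbalizers and hierarchical contrastive loss are omitted, and the toy vocabulary and hierarchy are hypothetical.

```python
import torch

def path_score_classify(mask_logits, verbalizer_ids, hierarchy):
    """Score each leaf by summing its verbalizer-token logits along the
    root-to-leaf path, then return the best-scoring leaf."""
    scores = {}
    for leaf, path in hierarchy.items():
        scores[leaf] = float(sum(mask_logits[verbalizer_ids[n]] for n in path))
    return max(scores, key=scores.get)

# Toy vocabulary and two root-to-leaf paths (all hypothetical).
vocab = {"electronics": 0, "laptop": 1, "kitchen": 2, "blender": 3}
hierarchy = {"laptop": ["electronics", "laptop"], "blender": ["kitchen", "blender"]}
mask_logits = torch.randn(len(vocab))  # logits at the [MASK] position
print(path_score_classify(mask_logits, vocab, hierarchy))
```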
Extensive experiments on three popular HTC datasets under few-shot settings demonstrate that prompting with HierVerb significantly boosts HTC performance, while indicating an elegant way to bridge the gap between large pre-trained models and downstream hierarchical classification tasks.", + "authors": [ + "Ke Ji", + "Yixin Lian", + "Jingsheng Gao", + "Baoyuan Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.164", + "point2d": [ + 3.150406837463379, + -24.61390495300293 + ], + "cluster": 17.0 + }, + { + "idx": 166, + "title": "Summary-Oriented Vision Modeling for Multimodal Abstractive Summarization", + "abstract": "The goal of multimodal abstractive summarization (MAS) is to produce a concise summary given the multimodal data (text and vision). Existing studies on MAS mainly focus on how to effectively use the extracted visual features, having achieved impressive success on the high-resource English dataset. However, less attention has been paid to the relevance of the visual features to the summary, which may limit the model performance, especially in the low- and zero-resource scenarios. In this paper, we propose to improve the summary quality through summary-oriented visual features. To this end, we devise two auxiliary tasks, including a vision-to-summary task and a masked image modeling task. Together with the main summarization task, we optimize the MAS model via the training objectives of all these tasks. By these means, the MAS model can be enhanced by capturing the summary-oriented visual features, thereby yielding more accurate summaries. Experiments on 44 languages, covering mid-high-, low-, and zero-resource scenarios, verify the effectiveness and superiority of the proposed approach, which achieves state-of-the-art performance under all scenarios. Additionally, we will contribute a large-scale multilingual multimodal abstractive summarization (MM-Sum) dataset to the research community.", + "authors": [ + "Yunlong Liang", + "Fandong Meng", + "Jinan Xu", + "Jiaan Wang", + "Yufeng Chen", + "Jie Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.165", + "point2d": [ + -40.98481369018555, + 54.31181716918945 + ], + "cluster": 7.0 + }, + { + "idx": 167, + "title": "Helping a Friend or Supporting a Cause? Disentangling Active and Passive Cosponsorship in the U.S. Congress", + "abstract": "In the U.S. Congress, legislators can use active and passive cosponsorship to support bills. We show that these two types of cosponsorship are driven by two different motivations: the backing of political colleagues and the backing of the bill\u2019s content. To this end, we develop an Encoder+RGCN based model that learns legislator representations from bill texts and speech transcripts.
These representations predict active and passive cosponsorship with an F1-score of 0.88. Applying our representations to predict voting decisions, we show that they are interpretable and generalize to unseen tasks.", + "authors": [ + "Giuseppe Russo", + "Christoph Gote", + "Laurence Brandenberger", + "Sophia Schlosser", + "Frank Schweitzer" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.166", + "point2d": [ + 28.45566749572754, + 28.97747802734375 + ], + "cluster": 19.0 + }, + { + "idx": 168, + "title": "TREA: Tree-Structure Reasoning Schema for Conversational Recommendation", + "abstract": "Conversational recommender systems (CRS) aim to trace the dynamic interests of users through dialogues in a timely manner and generate relevant responses for item recommendations. Recently, various external knowledge bases (especially knowledge graphs) are incorporated into CRS to enhance the understanding of conversation contexts. However, recent reasoning-based models heavily rely on simplified structures such as linear structures or fixed-hierarchical structures for causality reasoning, hence they cannot fully figure out sophisticated relationships among utterances with external knowledge. To address this, we propose a novel Tree structure Reasoning schEmA named TREA. TREA constructs a multi-hierarchical scalable tree as the reasoning structure to clarify the causal relationships between mentioned entities, and fully utilizes historical conversations to generate more reasonable and suitable responses for recommended results. Extensive experiments on two public CRS datasets have demonstrated the effectiveness of our approach.", + "authors": [ + "Wendi Li", + "Wei Wei", + "Xiaoye Qu", + "Xian-Ling Mao", + "Ye Yuan", + "Wenfeng Xie", + "Dangyang Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.167", + "point2d": [ + 22.225372314453125, + 53.50389099121094 + ], + "cluster": 33.0 + }, + { + "idx": 169, + "title": "CATS: A Pragmatic Chinese Answer-to-Sequence Dataset with Large Scale and High Quality", + "abstract": "Three problems exist in popular data-to-text datasets. First, the large-scale datasets either contain noise or lack real application scenarios. Second, the datasets close to real applications are relatively small in size. Last, current datasets are biased toward the English language, leaving other languages underexplored. To alleviate these limitations, in this paper, we present CATS, a pragmatic Chinese answer-to-sequence dataset with large scale and high quality. The dataset aims to generate textual descriptions for the answer in the practical TableQA system. Further, to bridge the structural gap between the input SQL and table and establish better semantic alignments, we propose a Unified Graph Transformation approach to establish a joint encoding space for the two hybrid knowledge resources and convert this task to a graph-to-text problem. The experimental results demonstrate the effectiveness of our proposed method.
Further analysis on CATS attests to both the high quality and challenges of the dataset.", + "authors": [ + "Liang Li", + "Ruiying Geng", + "Chengyang Fang", + "Bing Li", + "Can Ma", + "Rongyu Cao", + "Binhua Li", + "Fei Huang", + "Yongbin Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.168", + "point2d": [ + 73.57933044433594, + 3.5854809284210205 + ], + "cluster": 5.0 + }, + { + "idx": 170, + "title": "Multilingual Multifaceted Understanding of Online News in Terms of Genre, Framing, and Persuasion Techniques", + "abstract": "We present a new multilingual multifacet dataset of news articles, each annotated for genre (objective news reporting vs. opinion vs. satire), framing (what key aspects are highlighted), and persuasion techniques (logical fallacies, emotional appeals, ad hominem attacks, etc.). The persuasion techniques are annotated at the span level, using a taxonomy of 23 fine-grained techniques grouped into 6 coarse categories. The dataset contains 1,612 news articles covering recent news on current topics of public interest in six European languages (English, French, German, Italian, Polish, and Russian), with more than 37k annotated spans of persuasion techniques. We describe the dataset and the annotation process, and we report the evaluation results of multilabel classification experiments using state-of-the-art multilingual transformers at different levels of granularity: token-level, sentence-level, paragraph-level, and document-level.", + "authors": [ + "Jakub Piskorski", + "Nicolas Stefanovitch", + "Nikolaos Nikolaidis", + "Giovanni Da San Martino", + "Preslav Nakov" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.169", + "point2d": [ + 37.09701919555664, + 35.88051986694336 + ], + "cluster": 19.0 + }, + { + "idx": 171, + "title": "Learning Action Conditions from Instructional Manuals for Instruction Understanding", + "abstract": "The ability to infer pre- and postconditions of an action is vital for comprehending complex instructions, and is essential for applications such as autonomous instruction-guided agents and assistive AI that supports humans to perform physical tasks. In this work, we propose a task dubbed action condition inference, which extracts mentions of preconditions and postconditions of actions in instructional manuals. We propose a weakly supervised approach utilizing automatically constructed large-scale training instances from online instructions, and curate a densely human-annotated and validated dataset to study how well the current NLP models do on the proposed task. We design two types of models that differ in whether contextualized and global information is leveraged, as well as various combinations of heuristics to construct the weak supervision. Our experiments show a > 20% F1-score improvement when considering the entire instruction contexts and a > 6% F1-score benefit from the proposed heuristics.
However, the best-performing model is still well behind human performance.", + "authors": [ + "Te-Lin Wu", + "Caiqi Zhang", + "Qingyuan Hu", + "Alexander Spangher", + "Nanyun Peng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.170", + "point2d": [ + -20.00075340270996, + -21.712949752807617 + ], + "cluster": 36.0 + }, + { + "idx": 172, + "title": "StoryWars: A Dataset and Instruction Tuning Baselines for Collaborative Story Understanding and Generation", + "abstract": "Collaborative stories, which are texts created through the collaborative efforts of multiple authors with different writing styles and intentions, pose unique challenges for NLP models. Understanding and generating such stories remains an underexplored area due to the lack of open-domain corpora. To address this, we introduce StoryWars, a new dataset of over 40,000 collaborative stories written by 9,400 different authors from an online platform. We design 12 task types, comprising 7 understanding and 5 generation task types, on StoryWars, deriving 101 diverse story-related tasks in total as a multi-task benchmark covering all fully-supervised, few-shot, and zero-shot scenarios. Furthermore, we present our instruction-tuned model, InstructStory, for the story tasks showing that instruction tuning, in addition to achieving superior results in zero-shot and few-shot scenarios, can also obtain the best performance on the fully-supervised tasks in StoryWars, establishing strong multi-task benchmark performances on StoryWars.", + "authors": [ + "Yulun Du", + "Lydia Chilton" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.171", + "point2d": [ + -23.696035385131836, + 43.74521255493164 + ], + "cluster": 35.0 + }, + { + "idx": 173, + "title": "Did You Read the Instructions? Rethinking the Effectiveness of Task Definitions in Instruction Learning", + "abstract": "Large language models (LLMs) have shown impressive performance in following natural language instructions to solve unseen tasks. However, it remains unclear whether models truly understand task definitions and whether the human-written definitions are optimal. In this paper, we systematically study the role of task definitions in instruction learning. We first conduct an ablation analysis informed by human annotations to understand which parts of a task definition are most important, and find that model performance only drops substantially when removing contents describing the task output, in particular label information. Next, we propose an automatic algorithm to compress task definitions to a minimal supporting set of tokens, and find that 60% of tokens can be removed while maintaining or even improving model performance. Based on these results, we propose two strategies to help models better leverage task instructions: (1) providing only key information for tasks in a common structured format, and (2) adding a meta-tuning stage to help the model better understand the definitions.
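A compression step of this kind can be sketched as greedy hill-climbing over the definition's tokens. In the snippet below, `performance` is a stand-in for evaluating the model with a candidate definition, and the exact search used in the paper may differ; the toy performance function is purely illustrative.

```python
def compress_definition(tokens, performance, tol=0.0):
    """Greedily drop tokens from a task definition as long as the
    performance estimate stays within tol of the original score,
    yielding an approximately minimal supporting set."""
    base = performance(tokens)
    kept = list(tokens)
    i = 0
    while i < len(kept):
        candidate = kept[:i] + kept[i + 1:]
        if performance(candidate) >= base - tol:
            kept = candidate      # token was not needed; drop it
        else:
            i += 1                # token is load-bearing; keep it
    return kept

# Toy check: only the token naming the output label matters.
perf = lambda toks: 1.0 if "label" in toks else 0.0
print(compress_definition("classify the text and output a label".split(), perf))
```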
With these two strategies, we achieve a 4.2 Rouge-L improvement over 119 unseen test tasks.", + "authors": [ + "Fan Yin", + "Jesse Vig", + "Philippe Laban", + "Shafiq Joty", + "Caiming Xiong", + "Chien-Sheng Wu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.172", + "point2d": [ + -19.688865661621094, + -20.21567153930664 + ], + "cluster": 3.0 + }, + { + "idx": 174, + "title": "Do PLMs Know and Understand Ontological Knowledge?", + "abstract": "Ontological knowledge, which comprises classes and properties and their relationships, is integral to world knowledge. It is important to explore whether Pretrained Language Models (PLMs) know and understand such knowledge. However, existing PLM-probing studies focus mainly on factual knowledge, lacking a systematic probing of ontological knowledge. In this paper, we focus on probing whether PLMs store ontological knowledge and have a semantic understanding of the knowledge rather than rote memorization of the surface form. To probe whether PLMs know ontological knowledge, we investigate how well PLMs memorize: (1) types of entities; (2) hierarchical relationships among classes and properties, e.g., Person is a subclass of Animal and Member of Sports Team is a subproperty of Member of ; (3) domain and range constraints of properties, e.g., the subject of Member of Sports Team should be a Person and the object should be a Sports Team. To further probe whether PLMs truly understand ontological knowledge beyond memorization, we comprehensively study whether they can reliably perform logical reasoning with given knowledge according to ontological entailment rules. Our probing results show that PLMs can memorize certain ontological knowledge and utilize implicit knowledge in reasoning. However, both the memorizing and reasoning performances are less than perfect, indicating incomplete knowledge and understanding.", + "authors": [ + "Weiqi Wu", + "Chengyue Jiang", + "Yong Jiang", + "Pengjun Xie", + "Kewei Tu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.173", + "point2d": [ + 44.03479766845703, + -2.427387237548828 + ], + "cluster": 36.0 + }, + { + "idx": 175, + "title": "CORE: Cooperative Training of Retriever-Reranker for Effective Dialogue Response Selection", + "abstract": "Establishing retrieval-based dialogue systems that can select appropriate responses from the pre-built index has gained increasing attention. Recent common practice is to construct a two-stage pipeline with a fast retriever (e.g., bi-encoder) for first-stage recall followed by a smart response reranker (e.g., cross-encoder) for precise ranking. However, existing studies either optimize the retriever and reranker in independent ways, or distill the knowledge from a pre-trained reranker into the retriever in an asynchronous way, leading to sub-optimal performance of both modules. Thus, an open question remains about how to train them for a better combination of the best of both worlds. To this end, we present a cooperative training of the response retriever and the reranker whose parameters are dynamically optimized by the ground-truth labels as well as list-wise supervision signals from each other. As a result, the two modules can learn from each other and evolve together throughout the training.
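The list-wise mutual supervision can be sketched as a symmetric KL term between the two modules' score distributions over a shared candidate list. The snippet below shows this generic form only; CORE additionally uses the ground-truth labels, and the temperature and shapes here are our own assumptions.

```python
import torch
import torch.nn.functional as F

def mutual_distillation_loss(retriever_scores, reranker_scores, tau=1.0):
    """Pull each module's distribution over candidates toward the
    other's: KL(reranker || retriever) + KL(retriever || reranker)."""
    log_p_ret = F.log_softmax(retriever_scores / tau, dim=-1)
    log_p_rank = F.log_softmax(reranker_scores / tau, dim=-1)
    kl_a = F.kl_div(log_p_ret, log_p_rank.exp(), reduction="batchmean")
    kl_b = F.kl_div(log_p_rank, log_p_ret.exp(), reduction="batchmean")
    return kl_a + kl_b

scores_r = torch.randn(2, 10, requires_grad=True)  # bi-encoder, 10 candidates
scores_k = torch.randn(2, 10, requires_grad=True)  # cross-encoder
mutual_distillation_loss(scores_r, scores_k).backward()
```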
Experimental results on two benchmarks demonstrate the superiority of our method.", + "authors": [ + "Chongyang Tao", + "Jiazhan Feng", + "Tao Shen", + "Chang Liu", + "Juntao Li", + "Xiubo Geng", + "Daxin Jiang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.174", + "point2d": [ + 17.765056610107422, + 59.465457916259766 + ], + "cluster": 2.0 + }, + { + "idx": 176, + "title": "Exploring How Generative Adversarial Networks Learn Phonological Representations", + "abstract": "This paper explores how Generative Adversarial Networks (GANs) learn representations of phonological phenomena. We analyze how GANs encode contrastive and non-contrastive nasality in French and English vowels by applying the ciwGAN architecture (Begus, 2021). Begus claims that ciwGAN encodes linguistically meaningful representations with categorical variables in its latent space and that manipulating the latent variables provides an almost one-to-one control of the phonological features in ciwGAN\u2019s generated outputs. However, our results show an interactive effect of latent variables on the features in the generated outputs, which suggests the learned representations in neural networks are different from the phonological representations proposed by linguists. On the other hand, ciwGAN is able to distinguish contrastive and noncontrastive features in English and French by encoding them differently. Comparing the performance of GANs learning from different languages results in a better understanding of what language-specific features contribute to developing language-specific phonological representations. We also discuss the role of training data frequencies in phonological feature learning.", + "authors": [ + "Jingyi Chen", + "Micha Elsner" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.175", + "point2d": [ + -17.874828338623047, + 1.216254472732544 + ], + "cluster": 48.0 + }, + { + "idx": 177, + "title": "Interpretable Word Sense Representations via Definition Generation: The Case of Semantic Change Analysis", + "abstract": "We propose using automatically generated natural language definitions of contextualised word usages as interpretable word and word sense representations. Given a collection of usage examples for a target word, and the corresponding data-driven usage clusters (i.e., word senses), a definition is generated for each usage with a specialised Flan-T5 language model, and the most prototypical definition in a usage cluster is chosen as the sense label. We demonstrate how the resulting sense labels can make existing approaches to semantic change analysis more interpretable, and how they can allow users \u2014 historical linguists, lexicographers, or social scientists \u2014 to explore and intuitively explain diachronic trajectories of word meaning. Semantic change analysis is only one of many possible applications of the \u2018definitions as representations\u2019 paradigm.
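A compact sketch of this recipe: generate one definition per usage and label the cluster with the most prototypical definition. Both callables below are stand-ins (e.g., a Flan-T5 definition generator and any sentence encoder); the toy versions exist only to make the snippet runnable.

```python
import numpy as np

def sense_label(usages, generate_definition, embed):
    """Generate one definition per usage, then return the definition
    closest to the cluster centroid in embedding space as the label."""
    defs = [generate_definition(u) for u in usages]
    vecs = np.stack([embed(d) for d in defs])
    centroid = vecs.mean(axis=0)
    sims = vecs @ centroid / (np.linalg.norm(vecs, axis=1) * np.linalg.norm(centroid))
    return defs[int(np.argmax(sims))]

toy_def = lambda u: f"definition of '{u}'"
toy_embed = lambda s: np.array([len(s), s.count("a"), 1.0], dtype=float)
print(sense_label(["bank of a river", "steep river bank"], toy_def, toy_embed))
```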
Beyond being human-readable, contextualised definitions also outperform token or usage sentence embeddings in word-in-context semantic similarity judgements, making them a new promising type of lexical representation for NLP.", + "authors": [ + "Mario Giulianelli", + "Iris Luden", + "Raquel Fernandez", + "Andrey Kutuzov" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.176", + "point2d": [ + 4.855363845825195, + -46.7413444519043 + ], + "cluster": 9.0 + }, + { + "idx": 178, + "title": "Learning to Simulate Natural Language Feedback for Interactive Semantic Parsing", + "abstract": "Interactive semantic parsing based on natural language (NL) feedback, where users provide feedback to correct the parser mistakes, has emerged as a more practical scenario than the traditional one-shot semantic parsing. However, prior work has heavily relied on human-annotated feedback data to train the interactive semantic parser, which is prohibitively expensive and not scalable. In this work, we propose a new task of simulating NL feedback for interactive semantic parsing. We accompany the task with a novel feedback evaluator. The evaluator is specifically designed to assess the quality of the simulated feedback, based on which we decide the best feedback simulator from our proposed variants. On a text-to-SQL dataset, we show that our feedback simulator can generate high-quality NL feedback to boost the error correction ability of a specific parser. In low-data settings, our feedback simulator can help achieve comparable error correction performance as trained using the costly, full set of human annotations.", + "authors": [ + "Hao Yan", + "Saurabh Srivastava", + "Yintao Tai", + "Sida I. Wang", + "Wen-tau Yih", + "Ziyu Yao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.177", + "point2d": [ + -31.49121856689453, + -57.21141815185547 + ], + "cluster": 41.0 + }, + { + "idx": 179, + "title": "InfoMetIC: An Informative Metric for Reference-free Image Caption Evaluation", + "abstract": "Automatic image captioning evaluation is critical for benchmarking and promoting advances in image captioning research. Existing metrics only provide a single score to measure caption qualities, which are less explainable and informative. Instead, we humans can easily identify the problems of captions in details, e.g., which words are inaccurate and which salient objects are not described, and then rate the caption quality. To support such informative feedback, we propose an Informative Metric for Reference-free Image Caption evaluation (InfoMetIC). Given an image and a caption, InfoMetIC is able to report incorrect words and unmentioned image regions at fine-grained level, and also provide a text precision score, a vision recall score and an overall quality score at coarse-grained level. The coarse-grained score of InfoMetIC achieves significantly better correlation with human judgements than existing metrics on multiple benchmarks. We also construct a token-level evaluation dataset and demonstrate the effectiveness of InfoMetIC in fine-grained evaluation. 
Our code and datasets are publicly available at https://github.com/HAWLYQ/InfoMetIC.", + "authors": [ + "Anwen Hu", + "Shizhe Chen", + "Liang Zhang", + "Qin Jin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.178", + "point2d": [ + -58.162635803222656, + 45.610694885253906 + ], + "cluster": 43.0 + }, + { + "idx": 180, + "title": "An Invariant Learning Characterization of Controlled Text Generation", + "abstract": "Controlled generation refers to the problem of creating text that contains stylistic or semantic attributes of interest. Many approaches reduce this problem to training a predictor of the desired attribute. For example, researchers hoping to deploy a large language model to produce non-toxic content may use a toxicity classifier to filter generated text. In practice, the generated text to classify, which is determined by user prompts, may come from a wide range of distributions.In this paper, we show that the performance of controlled generation may be poor if the distributions of text in response to user prompts differ from the distribution the predictor was trained on. To address this problem, we cast controlled generation under distribution shift as an invariant learning problem: the most effective predictor should be invariant across multiple text environments. We then discuss a natural solution that arises from this characterization and propose heuristics for selecting natural environments.We study this characterization and the proposed method empirically using both synthetic and real data. Experiments demonstrate both the challenge of distribution shift in controlled generation and the potential of invariance methods in this setting.", + "authors": [ + "Carolina Zheng", + "Claudia Shi", + "Keyon Vafa", + "Amir Feder", + "David Blei" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.179", + "point2d": [ + -21.188188552856445, + 6.597432613372803 + ], + "cluster": 4.0 + }, + { + "idx": 181, + "title": "HistRED: A Historical Document-Level Relation Extraction Dataset", + "abstract": "Despite the extensive applications of relation extraction (RE) tasks in various domains, little has been explored in the historical context, which contains promising data across hundreds and thousands of years. To promote the historical RE research, we present HistRED constructed from Yeonhaengnok. Yeonhaengnok is a collection of records originally written in Hanja, the classical Chinese writing, which has later been translated into Korean. HistRED provides bilingual annotations such that RE can be performed on Korean and Hanja texts. In addition, HistRED supports various self-contained subtexts with different lengths, from a sentence level to a document level, supporting diverse context settings for researchers to evaluate the robustness of their RE models. To demonstrate the usefulness of our dataset, we propose a bilingual RE model that leverages both Korean and Hanja contexts to predict relations between entities. Our model outperforms monolingual baselines on HistRED, showing that employing multiple language contexts supplements the RE predictions. 
The dataset is publicly available at: https://huggingface.co/datasets/Soyoung/HistRED under a CC BY-NC-ND 4.0 license.", + "authors": [ + "Soyoung Yang", + "Minseok Choi", + "Youngwoo Cho", + "Jaegul Choo" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.180", + "point2d": [ + 43.696571350097656, + -62.22819137573242 + ], + "cluster": 25.0 + }, + { + "idx": 182, + "title": "A Critical Evaluation of Evaluations for Long-form Question Answering", + "abstract": "Long-form question answering (LFQA) enables answering a wide range of questions, but its flexibility poses enormous challenges for evaluation. We perform the first targeted study of the evaluation of long-form answers, covering both human and automatic evaluation practices. We hire domain experts in seven areas to provide preference judgments over pairs of answers, along with free-form justifications for their choices. We present a careful analysis of experts\u2019 evaluation, which focuses on new aspects such as the comprehensiveness of the answer. Next, we examine automatic text generation metrics, finding that no existing metrics are predictive of human preference judgments. However, some metrics correlate with fine-grained aspects of answers (e.g., coherence). We encourage future work to move away from a single \u201coverall score\u201d of the answer and adopt a multi-faceted evaluation, targeting aspects such as factuality and completeness. We publicly release all of our annotations and code to spur future work into LFQA evaluation.", + "authors": [ + "Fangyuan Xu", + "Yixiao Song", + "Mohit Iyyer", + "Eunsol Choi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.181", + "point2d": [ + 68.06562042236328, + 9.74749755859375 + ], + "cluster": 5.0 + }, + { + "idx": 183, + "title": "HyPe: Better Pre-trained Language Model Fine-tuning with Hidden Representation Perturbation", + "abstract": "Language models with the Transformers structure have shown great performance in natural language processing. However, problems such as over-fitting or representation collapse still arise when fine-tuning pre-trained language models on downstream tasks. In this work, we propose HyPe, a simple yet effective fine-tuning technique to alleviate such problems by perturbing hidden representations of Transformers layers. Unlike previous works that only add noise to inputs or parameters, we argue that the hidden representations of Transformers layers convey more diverse and meaningful language information. Therefore, making the Transformers layers more robust to hidden representation perturbations can further benefit the fine-tuning of PLMs en bloc. We conduct extensive experiments and analyses on GLUE and other natural language inference datasets. Results demonstrate that HyPe outperforms vanilla fine-tuning and enhances generalization of hidden representations from different layers.
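The hidden-representation perturbation idea is easy to prototype with a forward hook. In the sketch below, the noise placement and scale are our guesses at a reasonable default rather than the paper's exact configuration, and the tuple branch is there only to accommodate layers that return tuples.

```python
import torch

def add_hidden_noise(module, sigma=1e-5):
    """Register a forward hook that adds Gaussian noise to a layer's
    output hidden states while the module is in training mode."""
    def hook(mod, inputs, output):
        if mod.training:
            hidden = output[0] if isinstance(output, tuple) else output
            noisy = hidden + sigma * torch.randn_like(hidden)
            return (noisy,) + output[1:] if isinstance(output, tuple) else noisy
        return output
    return module.register_forward_hook(hook)

# Toy usage on a generic encoder layer:
layer = torch.nn.TransformerEncoderLayer(d_model=16, nhead=4, batch_first=True)
handle = add_hidden_noise(layer)
out = layer(torch.randn(2, 5, 16))  # noise injected while layer.training is True
handle.remove()
```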
In addition, HyPe incurs negligible computational overhead, and it both outperforms and is compatible with previous state-of-the-art fine-tuning techniques.",
+    "authors": [
+      "Hongyi Yuan",
+      "Zheng Yuan",
+      "Chuanqi Tan",
+      "Fei Huang",
+      "Songfang Huang"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.182",
+    "point2d": [
+      -33.04744338989258,
+      -23.66374397277832
+    ],
+    "cluster": 8.0
+  },
+  {
+    "idx": 184,
+    "title": "Generating User-Engaging News Headlines",
+    "abstract": "The potential choices for news article headlines are enormous, and finding the right balance between conveying the essential message and capturing the reader\u2019s attention is key to effective headlining. However, presenting the same news headline to all readers is a suboptimal strategy, because it does not take into account the different preferences and interests of diverse readers, who may be confused about why a particular article has been recommended to them and do not see a clear connection between their interests and the recommended article. In this paper, we present a novel framework that addresses these challenges by incorporating user profiling to generate personalized headlines, and a combination of automated and human evaluation methods to determine user preference for personalized headlines. Our framework utilizes a learnable relevance function to assign personalized signature phrases to users based on their reading histories, which are then used to personalize headline generation. Through extensive evaluation, we demonstrate the effectiveness of our proposed framework in generating personalized headlines that meet the needs of a diverse audience. Our framework has the potential to improve the efficacy of news recommendations and facilitate creation of personalized content.",
+    "authors": [
+      "Pengshan Cai",
+      "Kaiqiang Song",
+      "Sangwoo Cho",
+      "Hongwei Wang",
+      "Xiaoyang Wang",
+      "Hong Yu",
+      "Fei Liu",
+      "Dong Yu"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.183",
+    "point2d": [
+      -5.77017879486084,
+      37.974639892578125
+    ],
+    "cluster": 7.0
+  },
+  {
+    "idx": 185,
+    "title": "Word sense extension",
+    "abstract": "Humans often make creative use of words to express novel senses. A long-standing effort in natural language processing has been focusing on word sense disambiguation (WSD), but little has been explored about how the sense inventory of a word may be extended toward novel meanings. We present a paradigm of word sense extension (WSE) that enables words to spawn new senses toward novel context. We develop a framework that simulates novel word sense extension by first partitioning a polysemous word type into two pseudo-tokens that mark its different senses, and then inferring whether the meaning of a pseudo-token can be extended to convey the sense denoted by the token partitioned from the same word type. Our framework combines cognitive models of chaining with a learning scheme that transforms a language model embedding space to support various types of word sense extension. We evaluate our framework against several competitive baselines and show that it is superior in predicting plausible novel senses for over 7,500 English words. 
Furthermore, we show that our WSE framework improves performance over a range of transformer-based WSD models in predicting rare word senses with few or zero mentions in the training data.",
+    "authors": [
+      "Lei Yu",
+      "Yang Xu"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.184",
+    "point2d": [
+      4.146936416625977,
+      -47.08029556274414
+    ],
+    "cluster": 9.0
+  },
+  {
+    "idx": 186,
+    "title": "PVGRU: Generating Diverse and Relevant Dialogue Responses via Pseudo-Variational Mechanism",
+    "abstract": "We investigate response generation for multi-turn dialogue in generative chatbots. Existing generative models based on RNNs (Recurrent Neural Networks) usually employ the last hidden state to summarize the history, which makes models unable to capture the subtle variability observed in different dialogues and to distinguish the differences between dialogues that are similar in composition. In this paper, we propose Pseudo-Variational Gated Recurrent Unit (PVGRU). The key novelty of PVGRU is a recurrent summarizing variable that aggregates the accumulated distribution variations of subsequences. We train PVGRU without relying on posterior knowledge, thus avoiding the training-inference inconsistency problem. PVGRU can perceive subtle semantic variability through summarizing variables that are optimized by two objectives we employ for training: distribution consistency and reconstruction. In addition, we build a Pseudo-Variational Hierarchical Dialogue (PVHD) model based on PVGRU. Experimental results demonstrate that PVGRU can broadly improve the diversity and relevance of responses on two benchmark datasets.",
+    "authors": [
+      "Yongkang Liu",
+      "Shi Feng",
+      "Daling Wang",
+      "Yifei Zhang",
+      "Hinrich Sch\u00fctze"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.185",
+    "point2d": [
+      9.474085807800293,
+      66.13720703125
+    ],
+    "cluster": 49.0
+  },
+  {
+    "idx": 187,
+    "title": "Decoding Symbolism in Language Models",
+    "abstract": "This work explores the feasibility of eliciting knowledge from language models (LMs) to decode symbolism, recognizing something (e.g., roses) as a stand-in for another (e.g., love). We present our evaluative framework, Symbolism Analysis (SymbA), which compares LMs (e.g., RoBERTa, GPT-J) on different types of symbolism and analyzes the outcomes along multiple metrics. Our findings suggest that conventional symbols are more reliably elicited from LMs while situated symbols are more challenging. Results also reveal the negative impact of the bias in pre-trained corpora. We further demonstrate that a simple re-ranking strategy can mitigate the bias and significantly improve model performances to be on par with human performances in some cases.",
+    "authors": [
+      "Meiqi Guo",
+      "Rebecca Hwa",
+      "Adriana Kovashka"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.186",
+    "point2d": [
+      -32.31339645385742,
+      -34.92242431640625
+    ],
+    "cluster": 36.0
+  },
+  {
+    "idx": 188,
+    "title": "A Survey on Zero Pronoun Translation",
+    "abstract": "Zero pronouns (ZPs) are frequently omitted in pro-drop languages (e.g. Chinese, Hungarian, and Hindi), but should be recalled in non-pro-drop languages (e.g. English). This phenomenon has been studied extensively in machine translation (MT), as it poses a significant challenge for MT systems due to the difficulty in determining the correct antecedent for the pronoun. 
This survey paper highlights the major works that have been undertaken in zero pronoun translation (ZPT) after the neural revolution so that researchers can recognize the current state and future directions of this field. We provide an organization of the literature based on evolution, dataset, method, and evaluation. In addition, we compare and analyze competing models and evaluation metrics on different benchmarks. We uncover a number of insightful findings such as: 1) ZPT is in line with the development trend of large language models; 2) data limitation causes learning bias in languages and domains; 3) performance improvements are often reported on single benchmarks, but advanced methods are still far from real-world use; 4) general-purpose metrics are not reliable on nuances and complexities of ZPT, emphasizing the necessity of targeted metrics; 5) apart from commonly-cited errors, ZPs will cause risks of gender bias.",
+    "authors": [
+      "Longyue Wang",
+      "Siyou Liu",
+      "Mingzhou Xu",
+      "Linfeng Song",
+      "Shuming Shi",
+      "Zhaopeng Tu"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.187",
+    "point2d": [
+      -54.82173538208008,
+      -0.36783888936042786
+    ],
+    "cluster": 1.0
+  },
+  {
+    "idx": 189,
+    "title": "We Understand Elliptical Sentences, and Language Models should Too: A New Dataset for Studying Ellipsis and its Interaction with Thematic Fit",
+    "abstract": "Ellipsis is a linguistic phenomenon characterized by the omission of one or more sentence elements. Solving such a linguistic construction is not a trivial issue in natural language processing since it involves the retrieval of non-overtly expressed verbal material, which might in turn require the model to integrate human-like syntactic and semantic knowledge. In this paper, we explored the issue of how the prototypicality of event participants affects the ability of Language Models (LMs) to handle elliptical sentences and to identify the omitted arguments at different degrees of thematic fit, ranging from highly typical participants to semantically anomalous ones. With this purpose in mind, we built ELLie, the first dataset composed entirely of utterances containing different types of elliptical constructions, and structurally suited for evaluating the effect of argument thematic fit in solving ellipsis and reconstructing the missing element. Our tests demonstrated that the probability scores assigned by the models are higher for typical events than for atypical and impossible ones in different elliptical contexts, confirming the influence of prototypicality of the event participants in interpreting such linguistic structures. Finally, we conducted a retrieval task of the elided verb in the sentence in which the low performance of LMs highlighted a considerable difficulty in reconstructing the correct event.",
+    "authors": [
+      "Davide Testa",
+      "Emmanuele Chersoni",
+      "Alessandro Lenci"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.188",
+    "point2d": [
+      -14.639710426330566,
+      -71.92085266113281
+    ],
+    "cluster": 41.0
+  },
+  {
+    "idx": 190,
+    "title": "MPCHAT: Towards Multimodal Persona-Grounded Conversation",
+    "abstract": "In order to build self-consistent personalized dialogue agents, previous research has mostly focused on textual persona that delivers personal facts or personalities. 
However, to fully describe the multi-faceted nature of persona, image modality can help better reveal the speaker\u2019s personal characteristics and experiences in episodic memory (Rubin et al., 2003; Conway, 2009). In this work, we extend persona-based dialogue to the multimodal domain and make two main contributions. First, we present the first multimodal persona-based dialogue dataset named MPCHAT, which extends persona with both text and images to contain episodic memories. Second, we empirically show that incorporating multimodal persona, as measured by three proposed multimodal persona-grounded dialogue tasks (i.e., next response prediction, grounding persona prediction, and speaker identification), leads to statistically significant performance improvements across all tasks. Thus, our work highlights that multimodal persona is crucial for improving multimodal dialogue comprehension, and our MPCHAT serves as a high-quality resource for this research.", + "authors": [ + "Jaewoo Ahn", + "Yeda Song", + "Sangdoo Yun", + "Gunhee Kim" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.189", + "point2d": [ + 10.791665077209473, + 74.65906524658203 + ], + "cluster": 49.0 + }, + { + "idx": 191, + "title": "DOC: Improving Long Story Coherence With Detailed Outline Control", + "abstract": "We propose the Detailed Outline Control (DOC) framework for improving long-range plot coherence when automatically generating several-thousand-word-long stories. DOC consists of two complementary components: a detailed outliner and a detailed controller. The detailed outliner creates a more detailed, hierarchically structured outline, shifting creative burden from the main drafting procedure to the planning stage. The detailed controller ensures the more detailed outline is still respected during generation by controlling story passages to align with outline details. In human evaluations of automatically generated stories, DOC substantially outperforms a strong Re3 baseline (Yang et al., 2022) on plot coherence (22.5% absolute gain), outline relevance (28.2%), and interestingness (20.7%). Humans also judged DOC to be much more controllable in an interactive generation setting.", + "authors": [ + "Kevin Yang", + "Dan Klein", + "Nanyun Peng", + "Yuandong Tian" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.190", + "point2d": [ + -24.524629592895508, + 45.45175552368164 + ], + "cluster": 35.0 + }, + { + "idx": 192, + "title": "Dual-Alignment Pre-training for Cross-lingual Sentence Embedding", + "abstract": "Recent studies have shown that dual encoder models trained with the sentence-level translation ranking task are effective methods for cross-lingual sentence embedding. However, our research indicates that token-level alignment is also crucial in multilingual scenarios, which has not been fully explored previously. Based on our findings, we propose a dual-alignment pre-training (DAP) framework for cross-lingual sentence embedding that incorporates both sentence-level and token-level alignment. To achieve this, we introduce a novel representation translation learning (RTL) task, where the model learns to use one-side contextualized token representation to reconstruct its translation counterpart. This reconstruction objective encourages the model to embed translation information into the token representation. 
Compared to other token-level alignment methods such as translation language modeling, RTL is more suitable for dual encoder architectures and is computationally efficient. Extensive experiments on three sentence-level cross-lingual benchmarks demonstrate that our approach can significantly improve sentence embedding. Our code is available at https://github.com/ChillingDream/DAP.",
+    "authors": [
+      "Ziheng Li",
+      "Shaohan Huang",
+      "Zihan Zhang",
+      "Zhi-Hong Deng",
+      "Qiang Lou",
+      "Haizhen Huang",
+      "Jian Jiao",
+      "Furu Wei",
+      "Weiwei Deng",
+      "Qi Zhang"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.191",
+    "point2d": [
+      -57.77947235107422,
+      -12.281811714172363
+    ],
+    "cluster": 21.0
+  },
+  {
+    "idx": 193,
+    "title": "Exploring Better Text Image Translation with Multimodal Codebook",
+    "abstract": "Text image translation (TIT) aims to translate the source texts embedded in the image to target translations, which has a wide range of applications and thus has important research value. However, current studies on TIT are confronted with two main bottlenecks: 1) this task lacks a publicly available TIT dataset, 2) dominant models are constructed in a cascaded manner, which tends to suffer from the error propagation of optical character recognition (OCR). In this work, we first annotate a Chinese-English TIT dataset named OCRMT30K, providing convenience for subsequent studies. Then, we propose a TIT model with a multimodal codebook, which is able to associate the image with relevant texts, providing useful supplementary information for translation. Moreover, we present a multi-stage training framework involving text machine translation, image-text alignment, and TIT tasks, which fully exploits additional bilingual texts, OCR dataset and our OCRMT30K dataset to train our model. Extensive experiments and in-depth analyses strongly demonstrate the effectiveness of our proposed model and training framework.",
+    "authors": [
+      "Zhibin Lan",
+      "Jiawei Yu",
+      "Xiang Li",
+      "Wen Zhang",
+      "Jian Luan",
+      "Bin Wang",
+      "Degen Huang",
+      "Jinsong Su"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.192",
+    "point2d": [
+      -68.40345764160156,
+      40.781227111816406
+    ],
+    "cluster": 26.0
+  },
+  {
+    "idx": 194,
+    "title": "FEDLEGAL: The First Real-World Federated Learning Benchmark for Legal NLP",
+    "abstract": "The inevitable private information in legal data necessitates legal artificial intelligence to study privacy-preserving and decentralized learning methods. Federated learning (FL) has emerged as a promising technique for multiple participants to collaboratively train a shared model while efficiently protecting the sensitive data of participants. However, to the best of our knowledge, there is no work on applying FL to legal NLP. To fill this gap, this paper presents the first real-world FL benchmark for legal NLP, coined FEDLEGAL, which comprises five legal NLP tasks and one privacy task based on the data from Chinese courts. Based on the extensive experiments on these datasets, our results show that FL faces new challenges in terms of real-world non-IID data. The benchmark also encourages researchers to investigate privacy protection using real-world data in the FL setting, as well as deploying models in resource-constrained scenarios. 
The code and datasets of FEDLEGAL are available here.",
+    "authors": [
+      "Zhuo Zhang",
+      "Xiangjing Hu",
+      "Jingyuan Zhang",
+      "Yating Zhang",
+      "Hui Wang",
+      "Lizhen Qu",
+      "Zenglin Xu"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.193",
+    "point2d": [
+      -2.422203779220581,
+      16.09535789489746
+    ],
+    "cluster": 15.0
+  },
+  {
+    "idx": 195,
+    "title": "A Gradient Control Method for Backdoor Attacks on Parameter-Efficient Tuning",
+    "abstract": "Parameter-Efficient Tuning (PET) has shown remarkable performance by fine-tuning only a small number of parameters of the pre-trained language models (PLMs) for the downstream tasks, while it is also possible to construct backdoor attacks due to the vulnerability of pre-trained weights. However, a large reduction in the number of attackable parameters in PET will cause the user\u2019s fine-tuning to greatly affect the effectiveness of backdoor attacks, resulting in backdoor forgetting. We find that the backdoor injection process can be regarded as multi-task learning, which has a convergence imbalance problem between the training of clean and poisoned data, and this problem might result in forgetting the backdoor. Based on this finding, we propose a gradient control method to consolidate the attack effect, comprising two strategies. One controls the gradient magnitude distribution across layers within one task and the other prevents the conflict of gradient directions between tasks. Compared with previous backdoor attack methods in the scenario of PET, our method improves the attack effectiveness on sentiment classification and spam detection, which shows that it is widely applicable to different tasks.",
+    "authors": [
+      "Naibin Gu",
+      "Peng Fu",
+      "Xiyu Liu",
+      "Zhengxiao Liu",
+      "Zheng Lin",
+      "Weiping Wang"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.194",
+    "point2d": [
+      -0.8319306969642639,
+      8.550176620483398
+    ],
+    "cluster": 48.0
+  },
+  {
+    "idx": 196,
+    "title": "History Semantic Graph Enhanced Conversational KBQA with Temporal Information Modeling",
+    "abstract": "Context information modeling is an important task in conversational KBQA. However, existing methods usually assume the independence of utterances and model them in isolation. In this paper, we propose a History Semantic Graph Enhanced KBQA model (HSGE) that is able to effectively model long-range semantic dependencies in conversation history while maintaining low computational cost. The framework incorporates a context-aware encoder, which employs a dynamic memory decay mechanism and models context at different levels of granularity. We evaluate HSGE on a widely used benchmark dataset for complex sequential question answering. 
Experimental results demonstrate that it outperforms existing baselines on average across all question types.",
+    "authors": [
+      "Hao Sun",
+      "Yang Li",
+      "Liwei Deng",
+      "Bowen Li",
+      "Binyuan Hui",
+      "Binhua Li",
+      "Yunshi Lan",
+      "Yan Zhang",
+      "Yongbin Li"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.195",
+    "point2d": [
+      21.329750061035156,
+      53.88225173950195
+    ],
+    "cluster": 5.0
+  },
+  {
+    "idx": 197,
+    "title": "From the One, Judge of the Whole: Typed Entailment Graph Construction with Predicate Generation",
+    "abstract": "Entailment Graphs (EGs) have been constructed based on extracted corpora as a strong and explainable form to indicate context-independent entailment relations in natural languages. However, EGs built by previous methods often suffer from severe sparsity issues, due to limited corpora available and the long-tail phenomenon of predicate distributions. In this paper, we propose a multi-stage method, Typed Predicate-Entailment Graph Generator (TP-EGG), to tackle this problem. Given several seed predicates, TP-EGG builds the graphs by generating new predicates and detecting entailment relations among them. The generative nature of TP-EGG helps us leverage the recent advances from large pretrained language models (PLMs), while avoiding the reliance on carefully prepared corpora. Experiments on benchmark datasets show that TP-EGG can generate high-quality and scale-controllable entailment graphs, achieving significant in-domain improvement over state-of-the-art EGs and boosting the performance of down-stream inference tasks.",
+    "authors": [
+      "Zhibin Chen",
+      "Yansong Feng",
+      "Dongyan Zhao"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.196",
+    "point2d": [
+      44.5766487121582,
+      -58.35517501831055
+    ],
+    "cluster": 25.0
+  },
+  {
+    "idx": 198,
+    "title": "Alleviating Over-smoothing for Unsupervised Sentence Representation",
+    "abstract": "Currently, learning better unsupervised sentence representations is the pursuit of many natural language processing communities. Many approaches based on pre-trained language models (PLMs) and contrastive learning have achieved promising results on this task. Experimentally, we observe that the over-smoothing problem reduces the capacity of these powerful PLMs, leading to sub-optimal sentence representations. In this paper, we present a simple method named Self-Contrastive Learning (SSCL) to alleviate this issue, which samples negatives from PLMs\u2019 intermediate layers, improving the quality of the sentence representation. Our proposed method is quite simple and can be easily extended to various state-of-the-art models for performance boosting, which can be seen as a plug-and-play contrastive framework for learning unsupervised sentence representation. 
Extensive results show that SSCL brings substantial performance improvements to different strong baselines (e.g., BERT and SimCSE) on Semantic Textual Similarity and Transfer datasets.",
+    "authors": [
+      "Nuo Chen",
+      "Linjun Shou",
+      "Jian Pei",
+      "Ming Gong",
+      "Bowen Cao",
+      "Jianhui Chang",
+      "Jia Li",
+      "Daxin Jiang"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.197",
+    "point2d": [
+      3.0577552318573,
+      -33.32369613647461
+    ],
+    "cluster": 20.0
+  },
+  {
+    "idx": 199,
+    "title": "Memory-efficient NLLB-200: Language-specific Expert Pruning of a Massively Multilingual Machine Translation Model",
+    "abstract": "The recently released NLLB-200 is a set of multilingual Neural Machine Translation models that cover 202 languages. The largest model is based on a Mixture of Experts architecture and achieves SoTA results across many language pairs. It contains 54.5B parameters and requires at least four 32GB GPUs just for inference. In this work, we propose a pruning method that enables the removal of up to 80% of experts without further finetuning and with a negligible loss in translation quality, which makes it feasible to run the model on a single 32GB GPU. Further analysis suggests that our pruning metrics can identify language-specific experts.",
+    "authors": [
+      "Yeskendir Koishekenov",
+      "Alexandre Berard",
+      "Vassilina Nikoulina"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.198",
+    "point2d": [
+      -60.38713455200195,
+      -9.669428825378418
+    ],
+    "cluster": 21.0
+  },
+  {
+    "idx": 200,
+    "title": "DAMP: Doubly Aligned Multilingual Parser for Task-Oriented Dialogue",
+    "abstract": "Modern virtual assistants use internal semantic parsing engines to convert user utterances to actionable commands. However, prior work has demonstrated multilingual models are less robust for semantic parsing compared to other tasks. In global markets such as India and Latin America, robust multilingual semantic parsing is critical as codeswitching between languages is prevalent for bilingual users. In this work we dramatically improve the zero-shot performance of a multilingual and codeswitched semantic parsing system using two stages of multilingual alignment. First, we show that contrastive alignment pretraining improves both English performance and transfer efficiency. We then introduce a constrained optimization approach for hyperparameter-free adversarial alignment during finetuning. Our Doubly Aligned Multilingual Parser (DAMP) improves mBERT transfer performance by 3x, 6x, and 81x on the Spanglish, Hinglish and Multilingual Task Oriented Parsing benchmarks respectively and outperforms XLM-R and mT5-Large using 3.2x fewer parameters.",
+    "authors": [
+      "William Held",
+      "Christopher Hidey",
+      "Fei Liu",
+      "Eric Zhu",
+      "Rahul Goel",
+      "Diyi Yang",
+      "Rushin Shah"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.199",
+    "point2d": [
+      -32.86288070678711,
+      -55.557498931884766
+    ],
+    "cluster": 46.0
+  },
+  {
+    "idx": 201,
+    "title": "From Characters to Words: Hierarchical Pre-trained Language Model for Open-vocabulary Language Understanding",
+    "abstract": "Current state-of-the-art models for natural language understanding require a preprocessing step to convert raw text into discrete tokens. This process, known as tokenization, relies on a pre-built vocabulary of words or sub-word morphemes. 
This fixed vocabulary limits the model\u2019s robustness to spelling errors and its capacity to adapt to new domains. In this work, we introduce a novel open-vocabulary language model that adopts a hierarchical two-level approach: one at the word level and another at the sequence level. Concretely, we design an intra-word module that uses a shallow Transformer architecture to learn word representations from their characters, and a deep inter-word Transformer module that contextualizes each word representation by attending to the entire word sequence. Our model thus directly operates on character sequences with explicit awareness of word boundaries, but without biased sub-word or word-level vocabulary. Experiments on various downstream tasks show that our method outperforms strong baselines. We also demonstrate that our hierarchical model is robust to textual corruption and domain shift.",
+    "authors": [
+      "Li Sun",
+      "Florian Luisier",
+      "Kayhan Batmanghelich",
+      "Dinei Florencio",
+      "Cha Zhang"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.200",
+    "point2d": [
+      -33.68099594116211,
+      -32.633033752441406
+    ],
+    "cluster": 6.0
+  },
+  {
+    "idx": 202,
+    "title": "MatSci-NLP: Evaluating Scientific Language Models on Materials Science Language Tasks Using Text-to-Schema Modeling",
+    "abstract": "We present MatSci-NLP, a natural language benchmark for evaluating the performance of natural language processing (NLP) models on materials science text. We construct the benchmark from publicly available materials science text data to encompass seven different NLP tasks, including conventional NLP tasks like named entity recognition and relation classification, as well as NLP tasks specific to materials science, such as synthesis action retrieval which relates to creating synthesis procedures for materials. We study various BERT-based models pretrained on different scientific text corpora on MatSci-NLP to understand the impact of pretraining strategies on understanding materials science text. Given the scarcity of high-quality annotated data in the materials science domain, we perform our fine-tuning experiments with limited training data to encourage generalization across MatSci-NLP tasks. Our experiments in this low-resource training setting show that language models pretrained on scientific text outperform BERT trained on general text. MatBERT, a model pretrained specifically on materials science journals, generally performs best for most tasks. Moreover, we propose a unified text-to-schema for multitask learning on MatSci-NLP and compare its performance with traditional fine-tuning methods. In our analysis of different training methods, we find that our proposed text-to-schema methods inspired by question-answering consistently outperform single and multitask NLP fine-tuning methods. 
The code and datasets are publicly available at https://github.com/BangLab-UdeM-Mila/NLP4MatSci-ACL23.",
+    "authors": [
+      "Yu Song",
+      "Santiago Miret",
+      "Bang Liu"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.201",
+    "point2d": [
+      28.947853088378906,
+      -26.444671630859375
+    ],
+    "cluster": 40.0
+  },
+  {
+    "idx": 203,
+    "title": "Code4Struct: Code Generation for Few-Shot Event Structure Prediction",
+    "abstract": "Large Language Models (LLMs) trained on a mixture of text and code have demonstrated impressive capability in translating natural language (NL) into structured code. We observe that semantic structures can be conveniently translated into code and propose Code4Struct to leverage such text-to-structure translation capability to tackle structured prediction tasks. As a case study, we formulate Event Argument Extraction (EAE) as converting text into event-argument structures that can be represented as a class object using code. This alignment between structures and code enables us to take advantage of Programming Language (PL) features such as inheritance and type annotation to introduce external knowledge or add constraints. We show that, with sufficient in-context examples, formulating EAE as a code generation problem is advantageous over using variants of text-based prompts. Despite only using 20 training event instances for each event type, Code4Struct is comparable to supervised models trained on 4,202 instances and outperforms current state-of-the-art (SOTA) models trained on 20-shot data by 29.5% absolute F1. Code4Struct can use 10-shot training data from a sibling event type to predict arguments for zero-resource event types and outperforms the zero-shot baseline by 12% absolute F1.",
+    "authors": [
+      "Xingyao Wang",
+      "Sha Li",
+      "Heng Ji"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.202",
+    "point2d": [
+      -8.0728120803833,
+      -50.84513473510742
+    ],
+    "cluster": 11.0
+  },
+  {
+    "idx": 204,
+    "title": "GENEVA: Benchmarking Generalizability for Event Argument Extraction with Hundreds of Event Types and Argument Roles",
+    "abstract": "Recent works in Event Argument Extraction (EAE) have focused on improving model generalizability to cater to new events and domains. However, standard benchmarking datasets like ACE and ERE cover less than 40 event types and 25 entity-centric argument roles. Limited diversity and coverage hinder these datasets from adequately evaluating the generalizability of EAE models. In this paper, we first contribute by creating a large and diverse EAE ontology. This ontology is created by transforming FrameNet, a comprehensive semantic role labeling (SRL) dataset for EAE, by exploiting the similarity between these two tasks. Then, exhaustive human expert annotations are collected to build the ontology, concluding with 115 events and 220 argument roles, with a significant portion of roles not being entities. We utilize this ontology to further introduce GENEVA, a diverse generalizability benchmarking dataset comprising four test suites aimed at evaluating models\u2019 ability to handle limited data and unseen event type generalization. We benchmark six EAE models from various families. The results show that owing to non-entity argument roles, even the best-performing model can only achieve 39% F1 score, indicating how GENEVA provides new challenges for generalization in EAE. 
Overall, our large and diverse EAE ontology can aid in creating more comprehensive future resources, while GENEVA is a challenging benchmarking dataset encouraging further research for improving generalizability in EAE. The code and data can be found at https://github.com/PlusLabNLP/GENEVA.",
+    "authors": [
+      "Tanmay Parekh",
+      "I-Hung Hsu",
+      "Kuan-Hao Huang",
+      "Kai-Wei Chang",
+      "Nanyun Peng"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.203",
+    "point2d": [
+      43.57514572143555,
+      -46.26205062866211
+    ],
+    "cluster": 28.0
+  },
+  {
+    "idx": 205,
+    "title": "Efficient Semiring-Weighted Earley Parsing",
+    "abstract": "We present Earley\u2019s (1970) context-free parsing algorithm as a deduction system, incorporating various known and new speed-ups. In particular, our presentation supports a known worst-case runtime improvement from Earley\u2019s (1970) O(N^3|G||R|), which is unworkable for the large grammars that arise in natural language processing, to O(N^3|G|), which matches the complexity of CKY on a binarized version of the grammar G. Here N is the length of the sentence, |R| is the number of productions in G, and |G| is the total length of those productions. We also provide a version that achieves runtime of O(N^3|M|) with |M| \\leq |G| when the grammar is represented compactly as a single finite-state automaton M (this is partly novel). We carefully treat the generalization to semiring-weighted deduction, preprocessing the grammar like Stolcke (1995) to eliminate the possibility of deduction cycles, and further generalize Stolcke\u2019s method to compute the weights of sentence prefixes. We also provide implementation details for efficient execution, ensuring that on a preprocessed grammar, the semiring-weighted versions of our methods have the same asymptotic runtime and space requirements as the unweighted methods, including sub-cubic runtime on some grammars.",
+    "authors": [
+      "Andreas Opedal",
+      "Ran Zmigrod",
+      "Tim Vieira",
+      "Ryan Cotterell",
+      "Jason Eisner"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.204",
+    "point2d": [
+      -22.007535934448242,
+      -60.633018493652344
+    ],
+    "cluster": 41.0
+  },
+  {
+    "idx": 206,
+    "title": "Tree-Based Representation and Generation of Natural and Mathematical Language",
+    "abstract": "Mathematical language in scientific communications and educational scenarios is important yet relatively understudied compared to natural languages. Recent works on mathematical language focus either on representing stand-alone mathematical expressions, especially in their natural tree format, or mathematical reasoning in pre-trained natural language models. Existing works on jointly modeling and generating natural and mathematical languages simply treat mathematical expressions as text, without accounting for the rigid structural properties of mathematical expressions. In this paper, we propose a series of modifications to existing language models to jointly represent and generate text and math: representing mathematical expressions as sequences of node tokens in their operator tree format, using math symbol and tree position embeddings to preserve the semantic and structural properties of mathematical expressions, and using a constrained decoding method to generate mathematically valid expressions. 
We ground our modifications in GPT-2, resulting in a model we call MathGPT, and demonstrate that it outperforms baselines on mathematical expression generation tasks.",
+    "authors": [
+      "Alexander Scarlatos",
+      "Andrew Lan"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.205",
+    "point2d": [
+      43.59143829345703,
+      -23.378355026245117
+    ],
+    "cluster": 12.0
+  },
+  {
+    "idx": 207,
+    "title": "ParaLS: Lexical Substitution via Pretrained Paraphraser",
+    "abstract": "Lexical substitution (LS) aims at finding appropriate substitutes for a target word in a sentence. Recently, LS methods based on pretrained language models have made remarkable progress, generating potential substitutes for a target word through analysis of its contextual surroundings. However, these methods tend to overlook the preservation of the sentence\u2019s meaning when generating the substitutes. This study explores how to generate the substitute candidates from a paraphraser, as the generated paraphrases from a paraphraser contain variations in word choice and preserve the sentence\u2019s meaning. Since we cannot directly generate the substitutes via commonly used decoding strategies, we propose two simple decoding strategies that focus on the variations of the target word during decoding. Experimental results show that our methods outperform state-of-the-art LS methods based on pre-trained language models on three benchmarks.",
+    "authors": [
+      "Jipeng Qiang",
+      "Kang Liu",
+      "Yun Li",
+      "Yunhao Yuan",
+      "Yi Zhu"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.206",
+    "point2d": [
+      -11.557607650756836,
+      4.983604431152344
+    ],
+    "cluster": 1.0
+  },
+  {
+    "idx": 208,
+    "title": "Peer-Label Assisted Hierarchical Text Classification",
+    "abstract": "Hierarchical text classification (HTC) is a challenging task, in which the labels of texts can be organized into a category hierarchy. To deal with the HTC problem, many existing works focus on utilizing the parent-child relationships that are explicitly shown in the hierarchy. However, texts with a category hierarchy also have some latent relevancy among labels in the same level of the hierarchy. We refer to these labels as peer labels; our work is the first to utilize their peer effects to improve classification performance. To fully explore the peer-label relationship, we develop a PeerHTC method. This method innovatively measures the latent relevancy of peer labels through several metrics and then encodes the relevancy with a Graph Convolutional Neural Network. We also propose a sample importance learning method to ameliorate the side effects introduced by modelling the peer-label relevancy. Our experiments on several standard datasets demonstrate the evidence of peer labels and the superiority of PeerHTC over other state-of-the-art HTC methods in terms of classification accuracy.",
+    "authors": [
+      "Junru Song",
+      "Feifei Wang",
+      "Yang Yang"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.207",
+    "point2d": [
+      0.9628188014030457,
+      -23.444490432739258
+    ],
+    "cluster": 17.0
+  },
+  {
+    "idx": 209,
+    "title": "Free Lunch for Efficient Textual Commonsense Integration in Language Models",
+    "abstract": "Recent years have witnessed the emergence of textual commonsense knowledge bases, aimed at providing more nuanced and context-rich knowledge. 
The integration of external commonsense into language models has been shown to be a key enabler in advancing the state-of-the-art for a wide range of NLP tasks. However, incorporating textual commonsense descriptions is computationally expensive, as compared to encoding conventional symbolic knowledge. In this paper, we propose a method to improve its efficiency without modifying the model. Our idea is to group training samples with similar commonsense descriptions into a single batch, thus reusing the encoded description across multiple samples. We theoretically investigate this problem and demonstrate that its upper bound can be reduced to the classic graph k-cut problem. Consequently, we propose a spectral clustering-based algorithm to solve this problem. Extensive experiments illustrate that the proposed batch partitioning approach effectively reduces the computational cost while preserving performance. The efficiency improvement is more pronounced on larger datasets and on devices with more memory capacity, attesting to its practical utility for large-scale applications.", + "authors": [ + "Wanyun Cui", + "Xingran Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.208", + "point2d": [ + -10.740663528442383, + -28.15048599243164 + ], + "cluster": 17.0 + }, + { + "idx": 210, + "title": "A Probabilistic Framework for Discovering New Intents", + "abstract": "Discovering new intents is of great significance for establishing the Task-Oriented Dialogue System. Most existing methods either cannot transfer prior knowledge contained in known intents or fall into the dilemma of forgetting prior knowledge in the follow-up. Furthermore, these methods do not deeply explore the intrinsic structure of unlabeled data, and as a result, cannot seek out the characteristics that define an intent in general. In this paper, starting from the intuition that discovering intents could be beneficial for identifying known intents, we propose a probabilistic framework for discovering intents where intent assignments are treated as latent variables. We adopt the Expectation Maximization framework for optimization. Specifically, In the E-step, we conduct intent discovery and explore the intrinsic structure of unlabeled data by the posterior of intent assignments. In the M-step, we alleviate the forgetting of prior knowledge transferred from known intents by optimizing the discrimination of labeled data. Extensive experiments conducted on three challenging real-world datasets demonstrate the generality and effectiveness of the proposed framework and implementation.", + "authors": [ + "Yunhua Zhou", + "Guofeng Quan", + "Xipeng Qiu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.209", + "point2d": [ + -7.233643054962158, + 74.13751983642578 + ], + "cluster": 32.0 + }, + { + "idx": 211, + "title": "MultiTACRED: A Multilingual Version of the TAC Relation Extraction Dataset", + "abstract": "Relation extraction (RE) is a fundamental task in information extraction, whose extension to multilingual settings has been hindered by the lack of supervised resources comparable in size to large English datasets such as TACRED (Zhang et al., 2017). To address this gap, we introduce the MultiTACRED dataset, covering 12 typologically diverse languages from 9 language families, which is created by machine-translating TACRED instances and automatically projecting their entity annotations. 
We analyze translation and annotation projection quality, identify error categories, and experimentally evaluate fine-tuned pretrained mono- and multilingual language models in common transfer learning scenarios. Our analyses show that machine translation is a viable strategy to transfer RE instances, with native speakers judging more than 83% of the translated instances to be linguistically and semantically acceptable. We find monolingual RE model performance to be comparable to the English original for many of the target languages, and that multilingual models trained on a combination of English and target language data can outperform their monolingual counterparts. However, we also observe a variety of translation and annotation projection errors, both due to the MT systems and linguistic features of the target languages, such as pronoun-dropping, compounding and inflection, that degrade dataset quality and RE model performance.",
+    "authors": [
+      "Leonhard Hennig",
+      "Philippe Thomas",
+      "Sebastian M\u00f6ller"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.210",
+    "point2d": [
+      44.07630157470703,
+      -63.36857604980469
+    ],
+    "cluster": 25.0
+  },
+  {
+    "idx": 212,
+    "title": "Towards Higher Pareto Frontier in Multilingual Machine Translation",
+    "abstract": "Multilingual neural machine translation has witnessed remarkable progress in recent years. However, the long-tailed distribution of multilingual corpora poses a challenge of Pareto optimization, i.e., optimizing for some languages may come at the cost of degrading the performance of others. Existing balancing training strategies are equivalent to a series of Pareto optimal solutions, which trade off on a Pareto frontier (in Pareto optimization, Pareto optimal solutions refer to solutions in which none of the objectives can be improved without sacrificing at least one of the other objectives; the set of all Pareto optimal solutions forms the Pareto frontier). In this work, we propose a new training framework, Pareto Mutual Distillation (Pareto-MD), towards pushing the Pareto frontier outwards rather than making trade-offs. Specifically, Pareto-MD collaboratively trains two Pareto optimal solutions that favor different languages and allows them to learn from the strengths of each other via knowledge distillation. Furthermore, we introduce a novel strategy to enable stronger communication between Pareto optimal solutions and broaden the applicability of our approach. Experimental results on the widely-used WMT and TED datasets show that our method significantly pushes the Pareto frontier and outperforms baselines by up to +2.46 BLEU (our code will be released upon acceptance).",
+    "authors": [
+      "Yichong Huang",
+      "Xiaocheng Feng",
+      "Xinwei Geng",
+      "Baohang Li",
+      "Bing Qin"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.211",
+    "point2d": [
+      -62.219242095947266,
+      -10.035534858703613
+    ],
+    "cluster": 21.0
+  },
+  {
+    "idx": 213,
+    "title": "Small Pre-trained Language Models Can be Fine-tuned as Large Models via Over-Parameterization",
+    "abstract": "By scaling the model size, large pre-trained language models (PLMs) have shown remarkable performance in various natural language processing tasks, mostly outperforming small PLMs by a large margin. However, due to the high computational cost, the huge number of parameters also restricts the applicability of large PLMs in real-world systems. 
In this paper, we focus on scaling up the parameters of PLMs only during fine-tuning, to benefit from the over-parameterization, without increasing the inference latency. Given a relatively small PLM, we over-parameterize it by employing a matrix product operator, an efficient and almost lossless decomposition method to factorize its contained parameter matrices into a set of higher-dimensional tensors. Considering the efficiency, we further propose both static and dynamic strategies to select the most important parameter matrices for over-parameterization. Extensive experiments have demonstrated that our approach can significantly boost the fine-tuning performance of small PLMs and even help small PLMs outperform 3\\times parameterized larger ones. Our code is publicly available at https://github.com/zfgao66/OPF.",
+    "authors": [
+      "Ze-Feng Gao",
+      "Kun Zhou",
+      "Peiyu Liu",
+      "Wayne Xin Zhao",
+      "Ji-Rong Wen"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.212",
+    "point2d": [
+      -35.724910736083984,
+      -18.996583938598633
+    ],
+    "cluster": 8.0
+  },
+  {
+    "idx": 214,
+    "title": "Entity Tracking in Language Models",
+    "abstract": "Keeping track of how states of entities change as a text or dialog unfolds is a key prerequisite to discourse understanding. Yet, there have been few systematic investigations into the ability of large language models (LLMs) to track discourse entities. In this work, we present a task probing to what extent a language model can infer the final state of an entity given an English description of the initial state and a series of state-changing operations. We use this task to first investigate whether Flan-T5, GPT-3 and GPT-3.5 can track the state of entities, and find that only GPT-3.5 models, which have been pretrained on large amounts of code, exhibit this ability. We then investigate whether smaller models pretrained primarily on text can learn to track entities, through finetuning T5 on several training/evaluation splits. While performance degrades for more complex splits, we find that even when evaluated on a different set of entities from training or longer operation sequences, a finetuned model can perform non-trivial entity tracking. Taken together, these results suggest that language models can learn to track entities but pretraining on text corpora alone does not make this capacity surface.",
+    "authors": [
+      "Najoung Kim",
+      "Sebastian Schuster"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.213",
+    "point2d": [
+      44.35730743408203,
+      3.522099018096924
+    ],
+    "cluster": 27.0
+  },
+  {
+    "idx": 215,
+    "title": "A Textual Dataset for Situated Proactive Response Selection",
+    "abstract": "Recent data-driven conversational models are able to return fluent, consistent, and informative responses to many kinds of requests and utterances in task-oriented scenarios. However, these responses are typically limited to just the immediate local topic instead of being wider-ranging and proactively taking the conversation further, for example making suggestions to help customers achieve their goals. This inadequacy reflects a lack of understanding of the interlocutor\u2019s situation and implicit goal. To address the problem, we introduce a task of proactive response selection based on situational information. 
We present a manually-curated dataset of 1.7k English conversation examples that include situational background information plus, for each conversation, a set of responses, only some of which are acceptable in the situation. A responsive and informed conversation system should select the appropriate responses and avoid inappropriate ones; doing so demonstrates the ability to adequately understand the initiating request and situation. Our benchmark experiments show that this is not an easy task even for strong neural models, offering opportunities for future research.",
+    "authors": [
+      "Naoki Otani",
+      "Jun Araki",
+      "HyeongSik Kim",
+      "Eduard Hovy"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.214",
+    "point2d": [
+      17.67087173461914,
+      63.01125717163086
+    ],
+    "cluster": 24.0
+  },
+  {
+    "idx": 216,
+    "title": "DiffusionNER: Boundary Diffusion for Named Entity Recognition",
+    "abstract": "In this paper, we propose DiffusionNER, which formulates the named entity recognition task as a boundary-denoising diffusion process and thus generates named entities from noisy spans. During training, DiffusionNER gradually adds noises to the golden entity boundaries by a fixed forward diffusion process and learns a reverse diffusion process to recover the entity boundaries. In inference, DiffusionNER first randomly samples some noisy spans from a standard Gaussian distribution and then generates the named entities by denoising them with the learned reverse diffusion process. The proposed boundary-denoising diffusion process allows progressive refinement and dynamic sampling of entities, empowering DiffusionNER with efficient and flexible entity generation capability. Experiments on multiple flat and nested NER datasets demonstrate that DiffusionNER achieves comparable or even better performance than previous state-of-the-art models.",
+    "authors": [
+      "Yongliang Shen",
+      "Kaitao Song",
+      "Xu Tan",
+      "Dongsheng Li",
+      "Weiming Lu",
+      "Yueting Zhuang"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.215",
+    "point2d": [
+      33.63534164428711,
+      -86.26883697509766
+    ],
+    "cluster": 14.0
+  },
+  {
+    "idx": 217,
+    "title": "WACO: Word-Aligned Contrastive Learning for Speech Translation",
+    "abstract": "End-to-end Speech Translation (E2E ST) aims to directly translate source speech into target text. Existing ST methods perform poorly when only extremely small speech-text data are available for training. We observe that an ST model\u2019s performance closely correlates with its embedding similarity between speech and source transcript. In this paper, we propose Word-Aligned COntrastive learning (WACO), a simple and effective method for extremely low-resource speech-to-text translation. Our key idea is bridging word-level representations for both speech and text modalities via contrastive learning. We evaluate WACO and other methods on the MuST-C dataset, a widely used ST benchmark, and on a low-resource direction Maltese-English from IWSLT 2023. Our experiments demonstrate that WACO outperforms the best baseline by 9+ BLEU points with only 1-hour parallel ST data. 
Code is available at https://github.com/owaski/WACO.", + "authors": [ + "Siqi Ouyang", + "Rong Ye", + "Lei Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.216", + "point2d": [ + -68.57369995117188, + 19.294254302978516 + ], + "cluster": 37.0 + }, + { + "idx": 218, + "title": "Cross-lingual Continual Learning", + "abstract": "The longstanding goal of multi-lingual learning has been to develop a universal cross-lingual model that can withstand the changes in multi-lingual data distributions. There has been a large amount of work to adapt such multi-lingual models to unseen target languages. However, the majority of work in this direction focuses on the standard one-hop transfer learning pipeline from source to target languages, whereas in realistic scenarios, new languages can be incorporated at any time in a sequential manner. In this paper, we present a principled Cross-lingual Continual Learning (CCL) evaluation paradigm, where we analyze different categories of approaches used to continually adapt to emerging data from different languages. We provide insights into what makes multilingual sequential learning particularly challenging. To surmount such challenges, we benchmark a representative set of cross-lingual continual learning algorithms and analyze their knowledge preservation, accumulation, and generalization capabilities compared to baselines on carefully curated datastreams. The implications of this analysis include a recipe for how to measure and balance different cross-lingual continual learning desiderata, which go beyond conventional transfer learning.", + "authors": [ + "Meryem M\u2019hamdi", + "Xiang Ren", + "Jonathan May" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.217", + "point2d": [ + -57.91334533691406, + -19.125911712646484 + ], + "cluster": 8.0 + }, + { + "idx": 219, + "title": "Faithful Question Answering with Monte-Carlo Planning", + "abstract": "Although large language models demonstrate remarkable question-answering performances, revealing the intermediate reasoning steps that the models faithfully follow remains challenging. In this paper, we propose FAME (FAithful question answering with MontE-carlo planning) to answer questions based on faithful reasoning steps. The reasoning steps are organized as a structured entailment tree, which shows how premises are used to produce intermediate conclusions that can prove the correctness of the answer. We formulate the task as a discrete decision-making problem and solve it through the interaction of a reasoning environment and a controller. The environment is modular and contains several basic task-oriented modules, while the controller proposes actions to assemble the modules. Since the search space could be large, we introduce a Monte-Carlo planning algorithm to do a look-ahead search and select actions that will eventually lead to high-quality steps. FAME achieves advanced performance on the standard benchmark. 
Compared with large language models, it can produce valid and faithful reasoning steps with a much smaller model size.",
+    "authors": [
+      "Ruixin Hong",
+      "Hongming Zhang",
+      "Hong Zhao",
+      "Dong Yu",
+      "Changshui Zhang"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.218",
+    "point2d": [
+      51.53790283203125,
+      -13.386573791503906
+    ],
+    "cluster": 31.0
+  },
+  {
+    "idx": 220,
+    "title": "Unbalanced Optimal Transport for Unbalanced Word Alignment",
+    "abstract": "Monolingual word alignment is crucial to model semantic interactions between sentences. In particular, null alignment, a phenomenon in which words have no corresponding counterparts, is pervasive and critical in handling semantically divergent sentences. Identification of null alignment is useful on its own to reason about the semantic similarity of sentences by indicating there exists information inequality. To achieve unbalanced word alignment that values both alignment and null alignment, this study shows that the family of optimal transport (OT), i.e., balanced, partial, and unbalanced OT, are natural and powerful approaches even without tailor-made techniques. Our extensive experiments covering unsupervised and supervised settings indicate that our generic OT-based alignment methods are competitive against state-of-the-art methods specially designed for word alignment, remarkably on challenging datasets with high null alignment frequencies.",
+    "authors": [
+      "Yuki Arase",
+      "Han Bao",
+      "Sho Yokoi"
+    ],
+    "year": 2023,
+    "source": "acl",
+    "publication_type": "long",
+    "doi": "10.18653/v1/2023.acl-long.219",
+    "point2d": [
+      -57.38597106933594,
+      4.914487361907959
+    ],
+    "cluster": 1.0
+  },
+  {
+    "idx": 221,
+    "title": "Guiding Computational Stance Detection with Expanded Stance Triangle Framework",
+    "abstract": "Stance detection determines whether the author of a piece of text is in favor of, against, or neutral towards a specified target, and can be used to gain valuable insights into social media. The ubiquitous indirect referral of targets makes this task challenging, as it requires computational solutions to model semantic features and infer the corresponding implications from a literal statement. Moreover, the limited amount of available training data leads to subpar performance in out-of-domain and cross-target scenarios, as data-driven approaches are prone to rely on superficial and domain-specific features. In this work, we decompose the stance detection task from a linguistic perspective, and investigate key components and inference paths in this task. The stance triangle is a generic linguistic framework previously proposed to describe the fundamental ways people express their stance. We further expand it by characterizing the relationship between explicit and implicit objects. We then use the framework to extend one single training corpus with additional annotation. 
Experimental results show that strategically-enriched data can significantly improve the performance on out-of-domain and cross-target evaluation.", + "authors": [ + "Zhengyuan Liu", + "Yong Keong Yap", + "Hai Leong Chieu", + "Nancy Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.220", + "point2d": [ + 48.74433898925781, + 38.49042892456055 + ], + "cluster": 19.0 + }, + { + "idx": 222, + "title": "Analyzing and Reducing the Performance Gap in Cross-Lingual Transfer with Fine-tuning Slow and Fast", + "abstract": "Existing research has shown that a multilingual pre-trained language model fine-tuned with one (source) language also performs well on downstream tasks for non-source languages, even though no fine-tuning is done on these languages. However, there is a clear gap between the performance of the source language and that of the non-source languages. This paper analyzes the fine-tuning process, discovers when the performance gap changes and identifies which network weights affect the overall performance most. Additionally, the paper seeks to answer to what extent the gap can be reduced by reducing forgetting. Based on the analysis results, a method named Fine-tuning slow and fast with four training policies is proposed to address these issues. Experimental results show the proposed method outperforms baselines by a clear margin.", + "authors": [ + "Yiduo Guo", + "Yaobo Liang", + "Dongyan Zhao", + "Bing Liu", + "Nan Duan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.221", + "point2d": [ + -58.53397750854492, + -17.321104049682617 + ], + "cluster": 8.0 + }, + { + "idx": 223, + "title": "Improving Self-training for Cross-lingual Named Entity Recognition with Contrastive and Prototype Learning", + "abstract": "In cross-lingual named entity recognition (NER), self-training is commonly used to bridge the linguistic gap by training on pseudo-labeled target-language data. However, due to sub-optimal performance on target languages, the pseudo labels are often noisy and limit the overall performance. In this work, we aim to improve self-training for cross-lingual NER by combining representation learning and pseudo label refinement in one coherent framework.Our proposed method, namely ContProto mainly comprises two components: (1) contrastive self-training and (2) prototype-based pseudo-labeling. Our contrastive self-training facilitates span classification by separating clusters of different classes, and enhances cross-lingual transferability by producing closely-aligned representations between the source and target language. Meanwhile, prototype-based pseudo-labeling effectively improves the accuracy of pseudo labels during training. 
We evaluate ContProto on multiple transfer pairs, and experimental results show our method brings substantial improvements over current state-of-the-art methods.", + "authors": [ + "Ran Zhou", + "Xin Li", + "Lidong Bing", + "Erik Cambria", + "Chunyan Miao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.222", + "point2d": [ + 30.33837890625, + -87.12718963623047 + ], + "cluster": 14.0 + }, + { + "idx": 224, + "title": "MM-SHAP: A Performance-agnostic Metric for Measuring Multimodal Contributions in Vision and Language Models & Tasks", + "abstract": "Vision and language models (VL) are known to exploit unrobust indicators in individual modalities (e.g., introduced by distributional biases) instead of focusing on relevant information in each modality. That a unimodal model achieves similar accuracy on a VL task to a multimodal one, indicates that so-called unimodal collapse occurred. However, accuracy-based tests fail to detect e.g., when the model prediction is wrong, while the model used relevant information from a modality.Instead, we propose MM-SHAP, a performance-agnostic multimodality score based on Shapley values that reliably quantifies in which proportions a multimodal model uses individual modalities. We apply MM-SHAP in two ways: (1) to compare models for their average degree of multimodality, and (2) to measure for individual models the contribution of individual modalities for different tasks and datasets.Experiments with six VL models \u2013 LXMERT, CLIP and four ALBEF variants \u2013 on four VL tasks highlight that unimodal collapse can occur to different degrees and in different directions, contradicting the wide-spread assumption that unimodal collapse is one-sided. Based on our results, we recommend MM-SHAP for analysing multimodal tasks, to diagnose and guide progress towards multimodal integration. Code available at https://github.com/Heidelberg-NLP/MM-SHAP.", + "authors": [ + "Letitia Parcalabescu", + "Anette Frank" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.223", + "point2d": [ + -59.6572265625, + 31.834070205688477 + ], + "cluster": 16.0 + }, + { + "idx": 225, + "title": "Towards Boosting the Open-Domain Chatbot with Human Feedback", + "abstract": "Many open-domain dialogue models pre-trained with social media comments can generate coherent replies but have difficulties producing engaging responses. This phenomenon might mainly result from the deficiency of annotated human-human conversations and the misalignment with human preference. In this paper, we propose a novel and efficient framework Diamante to boost the open-domain chatbot, where two kinds of human feedback (including explicit demonstration and implicit preference) are collected and leveraged. By asking annotators to select or amend the model-generated candidate responses, Diamante efficiently collects the human demonstrated responses and constructs a Chinese chit-chat dataset. To enhance the alignment with human preference, Diamante leverages the implicit preference in the data collection process and introduces the generation-evaluation joint training. Comprehensive experiments indicate that the Diamante dataset and joint training paradigm can significantly boost the performance of pre-trained dialogue models. 
The overall engagingness of the previous state-of-the-art model has been improved remarkably by 50% in Chinese open-domain conversations.", + "authors": [ + "Hua Lu", + "Siqi Bao", + "Huang He", + "Fan Wang", + "Hua Wu", + "Haifeng Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.224", + "point2d": [ + 14.418460845947266, + 67.68600463867188 + ], + "cluster": 49.0 + }, + { + "idx": 226, + "title": "Knowledge-enhanced Mixed-initiative Dialogue System for Emotional Support Conversations", + "abstract": "Unlike empathetic dialogues, the system in emotional support conversations (ESC) is expected to not only convey empathy for comforting the help-seeker, but also proactively assist in exploring and addressing their problems during the conversation. In this work, we study the problem of mixed-initiative ESC where the user and system can both take the initiative in leading the conversation. Specifically, we conduct a novel analysis on mixed-initiative ESC systems with a tailor-designed schema that divides utterances into different types with speaker roles and initiative types. Four emotional support metrics are proposed to evaluate the mixed-initiative interactions. The analysis reveals the necessity and challenges of building mixed-initiative ESC systems. In the light of this, we propose a knowledge-enhanced mixed-initiative framework (KEMI) for ESC, which retrieves actual case knowledge from a large-scale mental health knowledge graph for generating mixed-initiative responses. Experimental results on two ESC datasets show the superiority of KEMI in both content-preserving evaluation and mixed initiative related analyses.", + "authors": [ + "Yang Deng", + "Wenxuan Zhang", + "Yifei Yuan", + "Wai Lam" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.225", + "point2d": [ + 26.948373794555664, + 68.1500244140625 + ], + "cluster": 33.0 + }, + { + "idx": 227, + "title": "UTC-IE: A Unified Token-pair Classification Architecture for Information Extraction", + "abstract": "Information Extraction (IE) spans several tasks with different output structures, such as named entity recognition, relation extraction and event extraction. Previously, those tasks were solved with different models because of diverse task output structures. Through re-examining IE tasks, we find that all of them can be interpreted as extracting spans and span relations. They can further be decomposed into token-pair classification tasks by using the start and end token of a span to pinpoint the span, and using the start-to-start and end-to-end token pairs of two spans to determine the relation. Based on the reformulation, we propose a Unified Token-pair Classification architecture for Information Extraction (UTC-IE), where we introduce Plusformer on top of the token-pair feature matrix. Specifically, it models axis-aware interaction with plus-shaped self-attention and local interaction with Convolutional Neural Network over token pairs. Experiments show that our approach outperforms task-specific and unified models on all tasks in 10 datasets, and achieves better or comparable results on 2 joint IE datasets. 
Moreover, UTC-IE speeds up over state-of-the-art models on IE tasks significantly in most datasets, which verifies the effectiveness of our architecture.", + "authors": [ + "Hang Yan", + "Yu Sun", + "Xiaonan Li", + "Yunhua Zhou", + "Xuanjing Huang", + "Xipeng Qiu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.226", + "point2d": [ + 33.19255065917969, + -60.38650894165039 + ], + "cluster": 38.0 + }, + { + "idx": 228, + "title": "Social-Group-Agnostic Bias Mitigation via the Stereotype Content Model", + "abstract": "Existing bias mitigation methods require social-group-specific word pairs (e.g., \u201cman\u201d \u2013 \u201cwoman\u201d) for each social attribute (e.g., gender), restricting the bias mitigation to only one specified social attribute. Further, this constraint renders such methods impractical and costly for mitigating bias in understudied and/or unmarked social groups. We propose that the Stereotype Content Model (SCM) \u2014 a theoretical framework developed in social psychology for understanding the content of stereotyping \u2014 can help debiasing efforts to become social-group-agnostic by capturing the underlying connection between bias and stereotypes. SCM proposes that the content of stereotypes map to two psychological dimensions of warmth and competence. Using only pairs of terms for these two dimensions (e.g., warmth: \u201cgenuine\u201d \u2013 \u201cfake\u201d; competence: \u201csmart\u201d \u2013 \u201cstupid\u201d), we perform debiasing with established methods on both pre-trained word embeddings and large language models. We demonstrate that our social-group-agnostic, SCM-based debiasing technique performs comparably to group-specific debiasing on multiple bias benchmarks, but has theoretical and practical advantages over existing approaches.", + "authors": [ + "Ali Omrani", + "Alireza Salkhordeh Ziabari", + "Charles Yu", + "Preni Golazizian", + "Brendan Kennedy", + "Mohammad Atari", + "Heng Ji", + "Morteza Dehghani" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.227", + "point2d": [ + 21.54753303527832, + 32.958106994628906 + ], + "cluster": 10.0 + }, + { + "idx": 229, + "title": "Revisiting the Gold Standard: Grounding Summarization Evaluation with Robust Human Evaluation", + "abstract": "Human evaluation is the foundation upon which the evaluation of both summarization systems and automatic metrics rests. However, existing human evaluation studies for summarization either exhibit a low inter-annotator agreement or have insufficient scale, and an in-depth analysis of human evaluation is lacking. Therefore, we address the shortcomings of existing summarization evaluation along the following axes: (1) We propose a modified summarization salience protocol, Atomic Content Units (ACUs), which is based on fine-grained semantic units and allows for a high inter-annotator agreement. (2) We curate the Robust Summarization Evaluation (RoSE) benchmark, a large human evaluation dataset consisting of 22,000 summary-level annotations over 28 top-performing systems on three datasets. (3) We conduct a comparative study of four human evaluation protocols, underscoring potential confounding factors in evaluation setups. (4) We evaluate 50 automatic metrics and their variants using the collected human annotations across evaluation protocols and demonstrate how our benchmark leads to more statistically stable and significant results. 
The metrics we benchmarked include recent methods based on large language models (LLMs), GPTScore and G-Eval. Furthermore, our findings have important implications for evaluating LLMs, as we show that LLMs adjusted by human feedback (e.g., GPT-3.5) may overfit unconstrained human evaluation, which is affected by the annotators\u2019 prior, input-agnostic preferences, calling for more robust, targeted evaluation methods.", + "authors": [ + "Yixin Liu", + "Alex Fabbri", + "Pengfei Liu", + "Yilun Zhao", + "Linyong Nan", + "Ruilin Han", + "Simeng Han", + "Shafiq Joty", + "Chien-Sheng Wu", + "Caiming Xiong", + "Dragomir Radev" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.228", + "point2d": [ + -8.737096786499023, + 38.774593353271484 + ], + "cluster": 47.0 + }, + { + "idx": 230, + "title": "FIREBALL: A Dataset of Dungeons and Dragons Actual-Play with Structured Game State Information", + "abstract": "Dungeons & Dragons (D&D) is a tabletop roleplaying game with complex natural language interactions between players and hidden state information.Recent work has shown that large language models (LLMs) that have access to state information can generate higher quality game turns than LLMs that use dialog history alone.However, previous work used game state information that was heuristically created and was not a true gold standard game state. We present FIREBALL, a large dataset containing nearly 25,000 unique sessions from real D&D gameplay on Discord with true game state info. We recorded game play sessions of players who used the Avrae bot, which was developed to aid people in playing D&D online, capturing language, game commands and underlying game state information. We demonstrate that FIREBALL can improve natural language generation (NLG) by using Avrae state information, improving both automated metrics and human judgments of quality.Additionally, we show that LLMs can generate executable Avrae commands, particularly after finetuning.", + "authors": [ + "Andrew Zhu", + "Karmanya Aggarwal", + "Alexander Feng", + "Lara Martin", + "Chris Callison-Burch" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.229", + "point2d": [ + 30.30005645751953, + 54.63036346435547 + ], + "cluster": 2.0 + }, + { + "idx": 231, + "title": "A fine-grained comparison of pragmatic language understanding in humans and language models", + "abstract": "Pragmatics and non-literal language understanding are essential to human communication, and present a long-standing challenge for artificial language models. We perform a fine-grained comparison of language models and humans on seven pragmatic phenomena, using zero-shot prompting on an expert-curated set of English materials. We ask whether models (1) select pragmatic interpretations of speaker utterances, (2) make similar error patterns as humans, and (3) use similar linguistic cues as humans to solve the tasks. We find that the largest models achieve high accuracy and match human error patterns: within incorrect responses, models favor literal interpretations over heuristic-based distractors. We also find preliminary evidence that models and humans are sensitive to similar linguistic cues. Our results suggest that pragmatic behaviors can emerge in models without explicitly constructed representations of mental states. 
However, models tend to struggle with phenomena relying on social expectation violations.", + "authors": [ + "Jennifer Hu", + "Sammy Floyd", + "Olessia Jouravlev", + "Evelina Fedorenko", + "Edward Gibson" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.230", + "point2d": [ + 42.523319244384766, + -7.334750652313232 + ], + "cluster": 36.0 + }, + { + "idx": 232, + "title": "Counterfactual Multihop QA: A Cause-Effect Approach for Reducing Disconnected Reasoning", + "abstract": "Multi-hop QA requires reasoning over multiple supporting facts to answer the question. However, the existing QA models always rely on shortcuts, e.g., providing the true answer by only one fact, rather than multi-hop reasoning, which is referred as disconnected reasoning problem. To alleviate this issue, we propose a novel counterfactual multihop QA, a causal-effect approach that enables to reduce the disconnected reasoning. It builds upon explicitly modeling of causality: 1) the direct causal effects of disconnected reasoning and 2) the causal effect of true multi-hop reasoning from the total causal effect. With the causal graph, a counterfactual inference is proposed to disentangle the disconnected reasoning from the total causal effect, which provides us a new perspective and technology to learn a QA model that exploits the true multi-hop reasoning instead of shortcuts. Extensive experiments have been conducted on the benchmark HotpotQA dataset, which demonstrate that the proposed method can achieve notable improvement on reducing disconnected reasoning. For example, our method achieves 5.8% higher points of its Supps score on HotpotQA through true multihop reasoning. The code is available at https://github.com/guowzh/CFMQA.", + "authors": [ + "Wangzhen Guo", + "Qinkang Gong", + "Yanghui Rao", + "Hanjiang Lai" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.231", + "point2d": [ + 64.54647827148438, + -9.697622299194336 + ], + "cluster": 31.0 + }, + { + "idx": 233, + "title": "Causal-Debias: Unifying Debiasing in Pretrained Language Models and Fine-tuning via Causal Invariant Learning", + "abstract": "Demographic biases and social stereotypes are common in pretrained language models (PLMs), and a burgeoning body of literature focuses on removing the unwanted stereotypical associations from PLMs. However, when fine-tuning these bias-mitigated PLMs in downstream natural language processing (NLP) applications, such as sentiment classification, the unwanted stereotypical associations resurface or even get amplified. Since pretrain&fine-tune is a major paradigm in NLP applications, separating the debiasing procedure of PLMs from fine-tuning would eventually harm the actual downstream utility. In this paper, we propose a unified debiasing framework Causal-Debias to remove unwanted stereotypical associations in PLMs during fine-tuning. Specifically, CausalDebias mitigates bias from a causal invariant perspective by leveraging the specific downstream task to identify bias-relevant and labelrelevant factors. We propose that bias-relevant factors are non-causal as they should have little impact on downstream tasks, while labelrelevant factors are causal. We perform interventions on non-causal factors in different demographic groups and design an invariant risk minimization loss to mitigate bias while maintaining task performance. 
Experimental results on three downstream tasks show that our proposed method can remarkably reduce unwanted stereotypical associations after PLMs are finetuned, while simultaneously minimizing the impact on PLMs and downstream applications.", + "authors": [ + "Fan Zhou", + "Yuzhou Mao", + "Liu Yu", + "Yi Yang", + "Ting Zhong" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.232", + "point2d": [ + 14.281163215637207, + 27.37510108947754 + ], + "cluster": 10.0 + }, + { + "idx": 234, + "title": "Parameter-Efficient Fine-Tuning without Introducing New Latency", + "abstract": "Parameter-efficient fine-tuning (PEFT) of pre-trained language models has recently demonstrated remarkable achievements, effectively matching the performance of full fine-tuning while utilizing significantly fewer trainable parameters, and consequently addressing the storage and communication constraints. Nonetheless, various PEFT methods are limited by their inherent characteristics. In the case of sparse fine-tuning, which involves modifying only a small subset of the existing parameters, the selection of fine-tuned parameters is task- and domain-specific, making it unsuitable for federated learning. On the other hand, PEFT methods with adding new parameters typically introduce additional inference latency. In this paper, we demonstrate the feasibility of generating a sparse mask in a task-agnostic manner, wherein all downstream tasks share a common mask. Our approach, which relies solely on the magnitude information of pre-trained parameters, surpasses existing methodologies by a significant margin when evaluated on the GLUE benchmark. Additionally, we introduce a novel adapter technique that directly applies the adapter to pre-trained parameters instead of the hidden representation, thereby achieving identical inference speed to that of full fine-tuning. Through extensive experiments, our proposed method attains a new state-of-the-art outcome in terms of both performance and storage efficiency, storing only 0.03% parameters of full fine-tuning.", + "authors": [ + "Baohao Liao", + "Yan Meng", + "Christof Monz" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.233", + "point2d": [ + -37.36198043823242, + -15.987092971801758 + ], + "cluster": 8.0 + }, + { + "idx": 235, + "title": "MANNER: A Variational Memory-Augmented Model for Cross Domain Few-Shot Named Entity Recognition", + "abstract": "This paper focuses on the task of cross domain few-shot named entity recognition (NER), which aims to adapt the knowledge learned from source domain to recognize named entities in target domain with only a few labeled examples. To address this challenging task, we propose MANNER, a variational memory-augmented few-shot NER model. Specifically, MANNER uses a memory module to store information from the source domain and then retrieve relevant information from the memory to augment few-shot task in the target domain. In order to effectively utilize the information from memory, MANNER uses optimal transport to retrieve and process information from memory, which can explicitly adapt the retrieved information from source domain to target domain and improve the performance in the cross domain few-shot setting. 
We conduct experiments on English and Chinese cross domain few-shot NER datasets, and the experimental results demonstrate that MANNER can achieve superior performance.", + "authors": [ + "Jinyuan Fang", + "Xiaobin Wang", + "Zaiqiao Meng", + "Pengjun Xie", + "Fei Huang", + "Yong Jiang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.234", + "point2d": [ + 32.27467727661133, + -87.74246215820312 + ], + "cluster": 14.0 + }, + { + "idx": 236, + "title": "MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages", + "abstract": "We present the MASSIVE dataset\u2013Multilingual Amazon Slu resource package (SLURP) for Slot-filling, Intent classification, and Virtual assistant Evaluation. MASSIVE contains 1M realistic, parallel, labeled virtual assistant utterances spanning 51 languages, 18 domains, 60 intents, and 55 slots. MASSIVE was created by tasking professional translators to localize the English-only SLURP dataset into 50 typologically diverse languages from 29 genera. We also present modeling results on XLM-R and mT5, including exact match accuracy, intent classification accuracy, and slot-filling F1 score. We have released our dataset, modeling code, and models publicly.", + "authors": [ + "Jack FitzGerald", + "Christopher Hench", + "Charith Peris", + "Scott Mackie", + "Kay Rottmann", + "Ana Sanchez", + "Aaron Nash", + "Liam Urbach", + "Vishesh Kakarala", + "Richa Singh", + "Swetha Ranganath", + "Laurie Crist", + "Misha Britan", + "Wouter Leeuwis", + "Gokhan Tur", + "Prem Natarajan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.235", + "point2d": [ + -24.245023727416992, + -38.67815017700195 + ], + "cluster": 46.0 + }, + { + "idx": 237, + "title": "Distilling Script Knowledge from Large Language Models for Constrained Language Planning", + "abstract": "In everyday life, humans often plan their actions by following step-by-step instructions in the form of goal-oriented scripts. Previous work has exploited language models (LMs) to plan for abstract goals of stereotypical activities (e.g., \u201cmake a cake\u201d), but leaves more specific goals with multi-facet constraints understudied (e.g., \u201cmake a cake for diabetics\u201d). In this paper, we define the task of constrained language planning for the first time. We propose an over-generate-then-filter approach to improve large language models (LLMs) on this task, and use it to distill a novel constrained language planning dataset, Coscript, which consists of 55,000 scripts. Empirical results demonstrate that our method significantly improves the constrained language planning ability of LLMs, especially on constraint faithfulness. 
Furthermore, Coscript is demonstrated to be quite effective in endowing smaller LMs with constrained language planning ability.", + "authors": [ + "Siyu Yuan", + "Jiangjie Chen", + "Ziquan Fu", + "Xuyang Ge", + "Soham Shah", + "Charles Jankowski", + "Yanghua Xiao", + "Deqing Yang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.236", + "point2d": [ + 54.55917739868164, + -10.310474395751953 + ], + "cluster": 36.0 + }, + { + "idx": 238, + "title": "REDFM: a Filtered and Multilingual Relation Extraction Dataset", + "abstract": "Relation Extraction (RE) is a task that identifies relationships between entities in a text, enabling the acquisition of relational facts and bridging the gap between natural language and structured knowledge. However, current RE models often rely on small datasets with low coverage of relation types, particularly when working with languages other than English.In this paper, we address the above issue and provide two new resources that enable the training and evaluation of multilingual RE systems.First, we present SREDFM, an automatically annotated dataset covering 18 languages, 400 relation types, 13 entity types, totaling more than 40 million triplet instances. Second, we propose REDFM, a smaller, human-revised dataset for seven languages that allows for the evaluation of multilingual RE systems. To demonstrate the utility of these novel datasets, we experiment with the first end-to-end multilingual RE model, mREBEL, that extracts triplets, including entity types, in multiple languages. We release our resources and model checkpoints at [https://www.github.com/babelscape/rebel](https://www.github.com/babelscape/rebel).", + "authors": [ + "\u202aPere-Llu\u00eds Huguet Cabot", + "Simone Tedeschi", + "Axel-Cyrille Ngonga Ngomo", + "Roberto Navigli" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.237", + "point2d": [ + 43.76872253417969, + -63.080718994140625 + ], + "cluster": 25.0 + }, + { + "idx": 239, + "title": "Modeling Appropriate Language in Argumentation", + "abstract": "Online discussion moderators must make ad-hoc decisions about whether the contributions of discussion participants are appropriate or should be removed to maintain civility. Existing research on offensive language and the resulting tools cover only one aspect among many involved in such decisions. The question of what is considered appropriate in a controversial discussion has not yet been systematically addressed. In this paper, we operationalize appropriate language in argumentation for the first time. In particular, we model appropriateness through the absence of flaws, grounded in research on argument quality assessment, especially in aspects from rhetoric. From these, we derive a new taxonomy of 14 dimensions that determine inappropriate language in online discussions. Building on three argument quality corpora, we then create a corpus of 2191 arguments annotated for the 14 dimensions. Empirical analyses support that the taxonomy covers the concept of appropriateness comprehensively, showing several plausible correlations with argument quality dimensions. 
Moreover, results of baseline approaches to assessing appropriateness suggest that all dimensions can be modeled computationally on the corpus.", + "authors": [ + "Timon Ziegenbein", + "Shahbaz Syed", + "Felix Lange", + "Martin Potthast", + "Henning Wachsmuth" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.238", + "point2d": [ + 41.3043327331543, + 39.664329528808594 + ], + "cluster": 19.0 + }, + { + "idx": 240, + "title": "CELDA: Leveraging Black-box Language Model as Enhanced Classifier without Labels", + "abstract": "Utilizing language models (LMs) without internal access is becoming an attractive paradigm in the field of NLP as many cutting-edge LMs are released through APIs and boast a massive scale.The de-facto method in this type of black-box scenario is known as prompting, which has shown progressive performance enhancements in situations where data labels are scarce or unavailable.Despite their efficacy, they still fall short in comparison to fully supervised counterparts and are generally brittle to slight modifications.In this paper, we propose Clustering-enhanced Linear Discriminative Analysis (CELDA), a novel approach that improves the text classification accuracy with a very weak-supervision signal (i.e., name of the labels).Our framework draws a precise decision boundary without accessing weights or gradients of the LM model or data labels.The core ideas of CELDA are twofold:(1) extracting a refined pseudo-labeled dataset from an unlabeled dataset, and (2) training a lightweight and robust model on the top of LM, which learns an accurate decision boundary from an extracted noisy dataset.Throughout in-depth investigations on various datasets, we demonstrated that CELDA reaches new state-of-the-art in weakly-supervised text classification and narrows the gap with a fully-supervised model.Additionally, our proposed methodology can be applied universally to any LM and has the potential to scale to larger models, making it a more viable option for utilizing large LMs.", + "authors": [ + "Hyunsoo Cho", + "Youna Kim", + "Sang-goo Lee" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.239", + "point2d": [ + -2.5969550609588623, + -19.658491134643555 + ], + "cluster": 17.0 + }, + { + "idx": 241, + "title": "MvP: Multi-view Prompting Improves Aspect Sentiment Tuple Prediction", + "abstract": "Generative methods greatly promote aspect-based sentiment analysis via generating a sequence of sentiment elements in a specified format. However, existing studies usually predict sentiment elements in a fixed order, which ignores the effect of the interdependence of the elements in a sentiment tuple and the diversity of language expression on the results. In this work, we propose Multi-view Prompting (MVP) that aggregates sentiment elements generated in different orders, leveraging the intuition of human-like problem-solving processes from different views. Specifically, MVP introduces element order prompts to guide the language model to generate multiple sentiment tuples, each with a different element order, and then selects the most reasonable tuples by voting. MVP can naturally model multi-view and multi-task as permutations and combinations of elements, respectively, outperforming previous task-specific designed methods on multiple ABSA tasks with a single model. 
Extensive experiments show that MVP significantly advances the state-of-the-art performance on 10 datasets of 4 benchmark tasks, and performs quite effectively in low-resource settings. Detailed evaluation verified the effectiveness, flexibility, and cross-task transferability of MVP.", + "authors": [ + "Zhibin Gou", + "Qingyan Guo", + "Yujiu Yang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.240", + "point2d": [ + 17.54715347290039, + -33.78998947143555 + ], + "cluster": 13.0 + }, + { + "idx": 242, + "title": "ACCENT: An Automatic Event Commonsense Evaluation Metric for Open-Domain Dialogue Systems", + "abstract": "Commonsense reasoning is omnipresent in human communications and thus is an important feature for open-domain dialogue systems. However, evaluating commonsense in dialogue systems is still an open challenge. We take the first step by focusing on event commonsense that considers events and their relations, and is crucial in both dialogues and general commonsense reasoning. We propose ACCENT, an event commonsense evaluation metric empowered by commonsense knowledge bases (CSKBs). ACCENT first extracts event-relation tuples from a dialogue, and then evaluates the response by scoring the tuples in terms of their compatibility with the CSKB. To evaluate ACCENT, we construct the first public event commonsense evaluation dataset for open-domain dialogues.Our experiments show that ACCENT is an efficient metric for event commonsense evaluation, which achieves higher correlations with human judgments than existing baselines.", + "authors": [ + "Sarik Ghazarian", + "Yijia Shao", + "Rujun Han", + "Aram Galstyan", + "Nanyun Peng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.241", + "point2d": [ + 12.829853057861328, + 55.25410079956055 + ], + "cluster": 24.0 + }, + { + "idx": 243, + "title": "Explanation-based Finetuning Makes Models More Robust to Spurious Cues", + "abstract": "Large Language Models (LLMs) are so powerful that they sometimes learn correlations between labels and features that are irrelevant to the task, leading to poor generalization on out-of-distribution data. We propose explanation-based finetuning as a general approach to mitigate LLMs\u2019 reliance on spurious correlations. Unlike standard finetuning where the model only predicts the answer given the input, we finetune the model to additionally generate a free-text explanation supporting its answer. To evaluate our method, we finetune the model on artificially constructed training sets containing different types of spurious cues, and test it on a test set without these cues. Compared to standard finetuning, our method makes GPT-3 (davinci) remarkably more robust against spurious cues in terms of accuracy drop across four classification tasks: ComVE (+1.2), CREAK (+9.1), e-SNLI (+15.4), and SBIC (+6.5). The efficacy generalizes across multiple model families and scales, with greater gains for larger models. 
Finally, our method also works well with explanations generated by the model, implying its applicability to more datasets without human-written explanations.", + "authors": [ + "Josh Magnus Ludan", + "Yixuan Meng", + "Tai Nguyen", + "Saurabh Shah", + "Qing Lyu", + "Marianna Apidianaki", + "Chris Callison-Burch" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.242", + "point2d": [ + 30.15773582458496, + -9.758397102355957 + ], + "cluster": 36.0 + }, + { + "idx": 244, + "title": "CAME: Confidence-guided Adaptive Memory Efficient Optimization", + "abstract": "Adaptive gradient methods, such as Adam and LAMB, have demonstrated excellent performance in the training of large language models. Nevertheless, the need for adaptivity requires maintaining second-moment estimates of the per-parameter gradients, which entails a high cost of extra memory overheads. To solve this problem, several memory-efficient optimizers (e.g., Adafactor) have been proposed to obtain a drastic reduction in auxiliary memory usage, but with a performance penalty. In this paper, we first study a confidence-guided strategy to reduce the instability of existing memory efficient optimizers. Based on this strategy, we propose CAME to simultaneously achieve two goals: fast convergence as in traditional adaptive methods, and low memory usage as in memory-efficient methods. Extensive experiments demonstrate the training stability and superior performance of CAME across various NLP tasks such as BERT and GPT-2 training. Notably, for BERT pre-training on the large batch size of 32,768, our proposed optimizer attains faster convergence and higher accuracy compared with the Adam optimizer. The implementation of CAME is publicly available.", + "authors": [ + "Yang Luo", + "Xiaozhe Ren", + "Zangwei Zheng", + "Zhuo Jiang", + "Xin Jiang", + "Yang You" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.243", + "point2d": [ + -37.73629379272461, + -18.347681045532227 + ], + "cluster": 8.0 + }, + { + "idx": 245, + "title": "On Second Thought, Let\u2019s Not Think Step by Step! Bias and Toxicity in Zero-Shot Reasoning", + "abstract": "Generating a Chain of Thought (CoT) has been shown to consistently improve large language model (LLM) performance on a wide range of NLP tasks. However, prior work has mainly focused on logical reasoning tasks (e.g. arithmetic, commonsense QA); it remains unclear whether improvements hold for more diverse types of reasoning, especially in socially situated contexts. Concretely, we perform a controlled evaluation of zero-shot CoT across two socially sensitive domains: harmful questions and stereotype benchmarks. We find that zero-shot CoT reasoning in sensitive domains significantly increases a model\u2019s likelihood to produce harmful or undesirable output, with trends holding across different prompt formats and model variants. Furthermore, we show that harmful CoTs increase with model size, but decrease with improved instruction following. 
Our work suggests that zero-shot CoT should be used with caution on socially important tasks, especially when marginalized groups or sensitive topics are involved.", + "authors": [ + "Omar Shaikh", + "Hongxin Zhang", + "William Held", + "Michael Bernstein", + "Diyi Yang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.244", + "point2d": [ + 42.582515716552734, + -9.242544174194336 + ], + "cluster": 36.0 + }, + { + "idx": 246, + "title": "Solving Math Word Problems via Cooperative Reasoning induced Language Models", + "abstract": "Large-scale pre-trained language models (PLMs) bring new opportunities to challenging problems, especially those that need high-level intelligence, such as the math word problem (MWPs). However, directly applying existing PLMs to MWPs can fail as the generation process lacks sufficient supervision and thus lacks fast adaptivity as humans. We notice that human reasoning has a dual reasoning framework that consists of an immediate reaction system (system 1) and a delicate reasoning system (system 2), where the entire reasoning is determined by their interaction. This inspires us to develop a cooperative reasoning-induced PLM for solving MWPs, called Cooperative Reasoning (CoRe), resulting in a human-like reasoning architecture with system 1 as the generator and system 2 as the verifier. In our approach, the generator is responsible for generating reasoning paths, and the verifiers are used to supervise the evaluation in order to obtain reliable feedback for the generator. We evaluate our CoRe framework on several mathematical reasoning datasets and achieve decent improvement over state-of-the-art methods, up to 9.6% increase over best baselines.", + "authors": [ + "Xinyu Zhu", + "Junjie Wang", + "Lin Zhang", + "Yuxiang Zhang", + "Yongfeng Huang", + "Ruyi Gan", + "Jiaxing Zhang", + "Yujiu Yang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.245", + "point2d": [ + 45.53635787963867, + -20.182485580444336 + ], + "cluster": 12.0 + }, + { + "idx": 247, + "title": "Exploiting Biased Models to De-bias Text: A Gender-Fair Rewriting Model", + "abstract": "Natural language generation models reproduce and often amplify the biases present in their training data. Previous research explored using sequence-to-sequence rewriting models to transform biased model outputs (or original texts) into more gender-fair language by creating pseudo training data through linguistic rules. However, this approach is not practical for languages with more complex morphology than English. We hypothesise that creating training data in the reverse direction, i.e. starting from gender-fair text, is easier for morphologically complex languages and show that it matches the performance of state-of-the-art rewriting models for English. To eliminate the rule-based nature of data creation, we instead propose using machine translation models to create gender-biased text from real gender-fair text via round-trip translation. Our approach allows us to train a rewriting model for German without the need for elaborate handcrafted rules. 
The outputs of this model increased gender-fairness as shown in a human evaluation study.", + "authors": [ + "Chantal Amrhein", + "Florian Schottmann", + "Rico Sennrich", + "Samuel L\u00e4ubli" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.246", + "point2d": [ + -20.176164627075195, + 12.243106842041016 + ], + "cluster": 4.0 + }, + { + "idx": 248, + "title": "Early Discovery of Disappearing Entities in Microblogs", + "abstract": "We make decisions by reacting to changes in the real world, particularly the emergence and disappearance of impermanent entities such as restaurants, services, and events. Because we want to avoid missing out on opportunities or making fruitless actions after those entities have disappeared, it is important to know when entities disappear as early as possible.We thus tackle the task of detecting disappearing entities from microblogs where various information is shared timely.The major challenge is detecting uncertain contexts of disappearing entities from noisy microblog posts.To collect such disappearing contexts, we design time-sensitive distant supervision, which utilizes entities from the knowledge base and time-series posts.Using this method, we actually build large-scale Twitter datasets of disappearing entities.To ensure robust detection in noisy environments, we refine pretrained word embeddings for the detection model on microblog streams in a timely manner.Experimental results on the Twitter datasets confirmed the effectiveness of the collected labeled data and refined word embeddings; the proposed method outperformed a baseline in terms of accuracy, and more than 70% of the detected disappearing entities in Wikipedia are discovered earlier than the update on Wikipedia, with the average lead-time is over one month.", + "authors": [ + "Satoshi Akasaki", + "Naoki Yoshinaga", + "Masashi Toyoda" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.247", + "point2d": [ + 36.79417419433594, + 20.39848518371582 + ], + "cluster": 19.0 + }, + { + "idx": 249, + "title": "DiffusionBERT: Improving Generative Masked Language Models with Diffusion Models", + "abstract": "We present DiffusionBERT, a new generative masked language model based on discrete dif- fusion models. Diffusion models and many pre- trained language models have a shared training objective, i.e., denoising, making it possible to combine the two powerful models and enjoy the best of both worlds. On the one hand, dif- fusion models offer a promising training strat- egy that helps improve the generation quality. On the other hand, pre-trained denoising lan- guage models (e.g., BERT) can be used as a good initialization that accelerates convergence. We explore training BERT to learn the reverse process of a discrete diffusion process with an absorbing state and elucidate several designs to improve it. First, we propose a new noise schedule for the forward diffusion process that controls the degree of noise added at each step based on the information of each token. Sec- ond, we investigate several designs of incorpo- rating the time step into BERT. Experiments on unconditional text generation demonstrate that DiffusionBERT achieves significant improve- ment over existing diffusion models for text (e.g., D3PM and Diffusion-LM) and previous generative masked language models in terms of perplexity and BLEU score. 
Promising re- sults in conditional generation tasks show that DiffusionBERT can generate texts of compa- rable quality and more diverse than a series of established baselines.", + "authors": [ + "Zhengfu He", + "Tianxiang Sun", + "Qiong Tang", + "Kuanning Wang", + "Xuanjing Huang", + "Xipeng Qiu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.248", + "point2d": [ + -28.4321346282959, + 7.795199394226074 + ], + "cluster": 4.0 + }, + { + "idx": 250, + "title": "Lifting the Curse of Capacity Gap in Distilling Language Models", + "abstract": "Pretrained language models (LMs) have shown compelling performance on various downstream tasks, but unfortunately they require a tremendous amount of inference compute. Knowledge distillation finds a path to compress LMs to small ones with a teacher-student paradigm. However, when the capacity gap between the teacher and the student is large, a curse of capacity gap appears, invoking a deficiency in distilling LMs. While a few studies have been carried out to fill the gap, the curse is not yet well tackled. In this paper, we aim at lifting the curse of capacity gap via enlarging the capacity of the student without notably increasing the inference compute. Largely motivated by sparse activation regime of mixture of experts (MoE), we propose a mixture of minimal experts (MiniMoE), which imposes extra parameters to the student but introduces almost no additional inference compute. Experimental results on GLUE and CoNLL demonstrate the curse of capacity gap is lifted by the magic of MiniMoE to a large extent. MiniMoE also achieves the state-of-the-art performance at small FLOPs compared with a range of competitive baselines. With a compression rate as much as ~50\\times, MiniMoE preserves ~95% GLUE score of the teacher.", + "authors": [ + "Chen Zhang", + "Yang Yang", + "Jiahao Liu", + "Jingang Wang", + "Yunsen Xian", + "Benyou Wang", + "Dawei Song" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.249", + "point2d": [ + -47.12397384643555, + -20.732349395751953 + ], + "cluster": 39.0 + }, + { + "idx": 251, + "title": "Towards Faithful Dialogues via Focus Learning", + "abstract": "Maintaining faithfulness between responses and knowledge is an important research topic for building reliable knowledge-grounded dialogue systems. Existing models heavily rely on elaborate data engineering or increasing the model\u2019s parameters ignoring to track the tokens that significantly influence losses, which is decisive for the optimization direction of the model in each iteration. To address this issue, we propose Focus Learning (FocusL), a novel learning approach that adjusts the contribution of each token to the optimization direction by directly scaling the corresponding objective loss. Specifically, we first introduce a positioning method by utilizing similarity distributions between knowledge and each response token to locate knowledge-aware tokens. Then, we further design a similarity-to-weight transformation to provide dynamic token-level weights for the cross-entropy loss. Finally, we use the weighted loss to encourage the model to pay special attention to the knowledge utilization. 
Experimental results demonstrate that our method achieves the new state-of-the-art results and generates more reliable responses while maintaining training stability.", + "authors": [ + "Yifan Deng", + "Xingsheng Zhang", + "Heyan Huang", + "Yue Hu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.250", + "point2d": [ + 15.341465950012207, + 58.70559310913086 + ], + "cluster": 2.0 + }, + { + "idx": 252, + "title": "Back Translation for Speech-to-text Translation Without Transcripts", + "abstract": "The success of end-to-end speech-to-text translation (ST) is often achieved by utilizing source transcripts, e.g., by pre-training with automatic speech recognition (ASR) and machine translation (MT) tasks, or by introducing additional ASR and MT data. Unfortunately, transcripts are only sometimes available since numerous unwritten languages exist worldwide. In this paper, we aim to utilize large amounts of target-side monolingual data to enhance ST without transcripts. Motivated by the remarkable success of back translation in MT, we develop a back translation algorithm for ST (BT4ST) to synthesize pseudo ST data from monolingual target data. To ease the challenges posed by short-to-long generation and one-to-many mapping, we introduce self-supervised discrete units and achieve back translation by cascading a target-to-unit model and a unit-to-speech model. With our synthetic ST data, we achieve an average boost of 2.3 BLEU on MuST-C En-De, En-Fr, and En-Es datasets. More experiments show that our method is especially effective in low-resource scenarios.", + "authors": [ + "Qingkai Fang", + "Yang Feng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.251", + "point2d": [ + -68.91541290283203, + 17.444854736328125 + ], + "cluster": 37.0 + }, + { + "idx": 253, + "title": "Prompter: Zero-shot Adaptive Prefixes for Dialogue State Tracking Domain Adaptation", + "abstract": "A challenge in the Dialogue State Tracking (DST) field is adapting models to new domains without using any supervised data \u2014 zero-shot domain adaptation. Parameter-Efficient Transfer Learning (PETL) has the potential to address this problem due to its robustness. However, it has yet to be applied to the zero-shot scenarios, as it is not clear how to apply it unsupervisedly. Our method, Prompter, uses descriptions of target domain slots to generate dynamic prefixes that are concatenated to the key and values at each layer\u2019s self-attention mechanism. This allows for the use of prefix-tuning in zero-shot. Prompter outperforms previous methods on both the MultiWOZ and SGD benchmarks. In generating prefixes, our analyses find that Prompter not only utilizes the semantics of slot descriptions but also how often the slots appear together in conversation. Moreover, Prompter\u2019s gains are due to its improved ability to distinguish \u201dnone\u201d-valued dialogue slots, compared against baselines.", + "authors": [ + "Ibrahim Taha Aksu", + "Min-Yen Kan", + "Nancy Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.252", + "point2d": [ + 1.8944752216339111, + 71.7250747680664 + ], + "cluster": 49.0 + }, + { + "idx": 254, + "title": "Enhancing Dialogue Generation via Dynamic Graph Knowledge Aggregation", + "abstract": "Incorporating external graph knowledge into neural chatbot models has been proven effective for enhancing dialogue generation. 
However, in conventional graph neural networks (GNNs), message passing on a graph is independent from text, resulting in the graph representation hidden space differing from that of the text. This training regime of existing models therefore leads to a semantic gap between graph knowledge and text. In this study, we propose a novel framework for knowledge graph enhanced dialogue generation. We dynamically construct a multi-hop knowledge graph with pseudo nodes to involve the language model in feature aggregation within the graph at all steps. To avoid the semantic biases caused by learning on vanilla subgraphs, the proposed framework applies hierarchical graph attention to aggregate graph features on pseudo nodes and then attains a global feature. Therefore, the framework can better utilise the heterogeneous features from both the post and external graph knowledge. Extensive experiments demonstrate that our framework outperforms state-of-the-art (SOTA) baselines on dialogue generation. Further analysis also shows that our representation learning framework can fill the semantic gap by coagulating representations of both text and graph knowledge. Moreover, the language model also learns how to better select knowledge triples for a more informative response via exploiting subgraph patterns within our feature aggregation process. Our code and resources are available at https://github.com/tangg555/SaBART.", + "authors": [ + "Chen Tang", + "Hongbo Zhang", + "Tyler Loakman", + "Chenghua Lin", + "Frank Guerin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.253", + "point2d": [ + -0.8741496801376343, + 54.429561614990234 + ], + "cluster": 49.0 + }, + { + "idx": 255, + "title": "Multi-modal Action Chain Abductive Reasoning", + "abstract": "\n Abductive Reasoning, has long been considered to be at the core ability of humans, which enables us to infer the most plausible explanation of incomplete known phenomena in daily life. However, such critical reasoning capability is rarely investigated for contemporary AI systems under such limited observations. To facilitate this research community, this paper sheds new light on Abductive Reasoning by studying a new vision-language task, Multi-modal Action chain abductive Reasoning (MAR), together with a large-scale Abductive Reasoning dataset: Given an incomplete set of language described events, MAR aims to imagine the most plausible event by spatio-temporal grounding in past video and then infer the hypothesis of subsequent action chain that can best explain the language premise. To solve this task, we propose a strong baseline model that realizes MAR from two perspectives: (i) we first introduce the transformer, which learns to encode the observation to imagine the plausible event with explicitly interpretable event grounding in the video based on the commonsense knowledge recognition ability. (ii) To complete the assumption of a follow-up action chain, we design a novel symbolic module that can complete strict derivation of the progressive action chain layer by layer. 
We conducted extensive experiments on the proposed dataset, and the experimental study shows that the proposed model significantly outperforms existing video-language models in terms of effectiveness on our newly created MAR dataset.", + "authors": [ + "Mengze Li", + "Tianbao Wang", + "Jiahe Xu", + "Kairong Han", + "Shengyu Zhang", + "Zhou Zhao", + "Jiaxu Miao", + "Wenqiao Zhang", + "Shiliang Pu", + "Fei Wu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.254", + "point2d": [ + 56.581363677978516, + -16.271909713745117 + ], + "cluster": 22.0 + }, + { + "idx": 256, + "title": "Exploring the Capacity of Pretrained Language Models for Reasoning about Actions and Change", + "abstract": "Reasoning about actions and change (RAC) is essential to understand and interact with the ever-changing environment. Previous AI research has shown the importance of fundamental and indispensable knowledge of actions, i.e., preconditions and effects. However, traditional methods rely on logical formalization which hinders practical applications. With recent transformer-based language models (LMs), reasoning over text is desirable and seemingly feasible, leading to the question of whether LMs can effectively and efficiently learn to solve RAC problems. We propose four essential RAC tasks as a comprehensive textual benchmark and generate problems in a way that minimizes the influence of other linguistic requirements (e.g., grounding) to focus on RAC. The resulting benchmark, TRAC, encompassing problems of various complexities, facilitates a more granular evaluation of LMs, precisely targeting the structural generalization ability much needed for RAC. Experiments with three high-performing transformers indicate that additional efforts are needed to tackle challenges raised by TRAC.", + "authors": [ + "Weinan He", + "Canming Huang", + "Zhanhao Xiao", + "Yongmei Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.255", + "point2d": [ + 48.9527587890625, + -8.701274871826172 + ], + "cluster": 36.0 + }, + { + "idx": 257, + "title": "Unified Demonstration Retriever for In-Context Learning", + "abstract": "In-context learning is a new learning paradigm where a language model conditions on a few input-output pairs (demonstrations) and a test input, and directly outputs the prediction. It has been shown sensitive to the provided demonstrations and thus promotes the research of demonstration retrieval: given a test input, relevant examples are retrieved from the training set to serve as informative demonstrations for in-context learning. While previous works train task-specific retrievers for several tasks separately, these methods are hard to transfer and scale on various tasks, and separately trained retrievers will cause a lot of parameter storage and deployment cost. In this paper, we propose Unified Demonstration Retriever (UDR), a single model to retrieve demonstrations for a wide range of tasks. To train UDR, we cast various tasks\u2019 training signals into a unified list-wise ranking formulation by language model\u2019s feedback. Then we propose a multi-task list-wise ranking training framework with an iterative mining strategy to find high-quality candidates, which can help UDR fully incorporate various tasks\u2019 signals. Experiments on 30+ tasks across 13 task families and multiple data domains show that UDR significantly outperforms baselines. 
Further analyses show the effectiveness of each proposed component and UDR\u2019s strong ability in various scenarios including different LMs (1.3B to 175B), unseen datasets, varying demonstration quantities, etc. We will release the code and model checkpoint after review.",
+        "authors": [
+            "Xiaonan Li",
+            "Kai Lv",
+            "Hang Yan",
+            "Tianyang Lin",
+            "Wei Zhu",
+            "Yuan Ni",
+            "Guotong Xie",
+            "Xiaoling Wang",
+            "Xipeng Qiu"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.256",
+        "point2d": [
+            -13.292579650878906,
+            -17.07717514038086
+        ],
+        "cluster": 3.0
+    },
+    {
+        "idx": 258,
+        "title": "Movie101: A New Movie Understanding Benchmark",
+        "abstract": "To help the visually impaired enjoy movies, automatic movie narrating systems are expected to narrate accurate, coherent, and role-aware plots when there are no speaking lines of actors. Existing works benchmark this challenge as a normal video captioning task via some simplifications, such as removing role names and evaluating narrations with n-gram-based metrics, which makes it difficult for automatic systems to meet the needs of real application scenarios. To narrow this gap, we construct a large-scale Chinese movie benchmark, named Movie101. Closer to real scenarios, the Movie Clip Narrating (MCN) task in our benchmark asks models to generate role-aware narration paragraphs for complete movie clips where no actors are speaking. External knowledge, such as role information and movie genres, is also provided for better movie understanding. Besides, we propose a new metric called Movie Narration Score (MNScore) for movie narrating evaluation, which achieves the best correlation with human evaluation. Our benchmark also supports the Temporal Narration Grounding (TNG) task to investigate clip localization given text descriptions. For both tasks, our proposed methods leverage external knowledge well and outperform carefully designed baselines. The dataset and code are released at https://github.com/yuezih/Movie101.",
+        "authors": [
+            "Zihao Yue",
+            "Qi Zhang",
+            "Anwen Hu",
+            "Liang Zhang",
+            "Ziheng Wang",
+            "Qin Jin"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.257",
+        "point2d": [
+            -57.22037124633789,
+            50.29429244995117
+        ],
+        "cluster": 43.0
+    },
+    {
+        "idx": 259,
+        "title": "Enhancing Language Representation with Constructional Information for Natural Language Understanding",
+        "abstract": "Natural language understanding (NLU) is an essential branch of natural language processing, which relies on representations generated by pre-trained language models (PLMs). However, PLMs primarily focus on acquiring lexico-semantic information, while they may be unable to adequately handle the meaning of constructions. To address this issue, we introduce construction grammar (CxG), which highlights the pairings of form and meaning, to enrich language representation. We adopt usage-based construction grammar as the basis of our work, which is highly compatible with statistical models such as PLMs. Then a HyCxG framework is proposed to enhance language representation through a three-stage solution. First, all constructions are extracted from sentences via a slot-constraints approach. As constructions can overlap with each other, bringing redundancy and imbalance, we formulate the conditional max coverage problem for selecting the discriminative constructions. 
Finally, we propose a relational hypergraph attention network to acquire representation from constructional information by capturing high-order word interactions among constructions. Extensive experiments demonstrate the superiority of the proposed model on a variety of NLU tasks.", + "authors": [ + "Lvxiaowei Xu", + "Jianwang Wu", + "Jiawei Peng", + "Zhilin Gong", + "Ming Cai", + "Tianxiang Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.258", + "point2d": [ + 10.11721420288086, + -50.184104919433594 + ], + "cluster": 41.0 + }, + { + "idx": 260, + "title": "Query Structure Modeling for Inductive Logical Reasoning Over Knowledge Graphs", + "abstract": "Logical reasoning over incomplete knowledge graphs to answer complex logical queries is a challenging task. With the emergence of new entities and relations in constantly evolving KGs, inductive logical reasoning over KGs has become a crucial problem. However, previous PLMs-based methods struggle to model the logical structures of complex queries, which limits their ability to generalize within the same structure. In this paper, we propose a structure-modeled textual encoding framework for inductive logical reasoning over KGs. It encodes linearized query structures and entities using pre-trained language models to find answers. For structure modeling of complex queries, we design stepwise instructions that implicitly prompt PLMs on the execution order of geometric operations in each query. We further separately model different geometric operations (i.e., projection, intersection, and union) on the representation space using a pre-trained encoder with additional attention and maxout layers to enhance structured modeling. We conduct experiments on two inductive logical reasoning datasets and three transductive datasets. The results demonstrate the effectiveness of our method on logical reasoning over KGs in both inductive and transductive settings.", + "authors": [ + "Siyuan Wang", + "Zhongyu Wei", + "Meng Han", + "Zhihao Fan", + "Haijun Shan", + "Qi Zhang", + "Xuanjing Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.259", + "point2d": [ + 54.40731430053711, + -61.03453063964844 + ], + "cluster": 45.0 + }, + { + "idx": 261, + "title": "DimonGen: Diversified Generative Commonsense Reasoning for Explaining Concept Relationships", + "abstract": "In this paper, we propose DimonGen, which aims to generate diverse sentences describing concept relationships in various everyday scenarios. To support this, we first create a benchmark dataset for this task by adapting the existing CommonGen dataset. We then propose a two-stage model called MoREE to generate the target sentences. MoREE consists of a mixture of retrievers model that retrieves diverse context sentences related to the given concepts, and a mixture of generators model that generates diverse sentences based on the retrieved contexts. We conduct experiments on the DimonGen task and show that MoREE outperforms strong baselines in terms of both the quality and diversity of the generated sentences. 
Our results demonstrate that MoREE is able to generate diverse sentences that reflect different relationships between concepts, leading to a comprehensive understanding of concept relationships.", + "authors": [ + "Chenzhengyi Liu", + "Jie Huang", + "Kerui Zhu", + "Kevin Chen-Chuan Chang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.260", + "point2d": [ + 57.19218063354492, + -20.115392684936523 + ], + "cluster": 31.0 + }, + { + "idx": 262, + "title": "Incorporating Attribution Importance for Improving Faithfulness Metrics", + "abstract": "Feature attribution methods (FAs) are popular approaches for providing insights into the model reasoning process of making predictions. The more faithful a FA is, the more accurately it reflects which parts of the input are more important for the prediction. Widely used faithfulness metrics, such as sufficiency and comprehensiveness use a hard erasure criterion, i.e. entirely removing or retaining the top most important tokens ranked by a given FA and observing the changes in predictive likelihood. However, this hard criterion ignores the importance of each individual token, treating them all equally for computing sufficiency and comprehensiveness. In this paper, we propose a simple yet effective soft erasure criterion. Instead of entirely removing or retaining tokens from the input, we randomly mask parts of the token vector representations proportionately to their FA importance. Extensive experiments across various natural language processing tasks and different FAs show that our soft-sufficiency and soft-comprehensiveness metrics consistently prefer more faithful explanations compared to hard sufficiency and comprehensiveness.", + "authors": [ + "Zhixue Zhao", + "Nikolaos Aletras" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.261", + "point2d": [ + -43.757266998291016, + -32.673011779785156 + ], + "cluster": 27.0 + }, + { + "idx": 263, + "title": "Reward Gaming in Conditional Text Generation", + "abstract": "To align conditional text generation model outputs with desired behaviors, there has been an increasing focus on training the model using reinforcement learning (RL) with reward functions learned from human annotations. Under this framework, we identify three common cases where high rewards are incorrectly assigned to undesirable patterns: noise-induced spurious correlation, naturally occurring spurious correlation, and covariate shift. We show that even though learned metrics achieve high performance on the distribution of the data used to train the reward function, the undesirable patterns may be amplified during RL training of the text generation model. 
While there has been discussion about reward gaming in the RL or safety community, in this discussion piece, we would like to highlight reward gaming in the natural language generation (NLG) community using concrete conditional text generation examples and discuss potential fixes and areas for future work.", + "authors": [ + "Richard Yuanzhe Pang", + "Vishakh Padmakumar", + "Thibault Sellam", + "Ankur Parikh", + "He He" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.262", + "point2d": [ + -17.93986701965332, + 9.124712944030762 + ], + "cluster": 4.0 + }, + { + "idx": 264, + "title": "Hidden Schema Networks", + "abstract": "Large, pretrained language models infer powerful representations that encode rich semantic and syntactic content, albeit implicitly. In this work we introduce a novel neural language model that enforces, via inductive biases, explicit relational structures which allow for compositionality onto the output representations of pretrained language models. Specifically, the model encodes sentences into sequences of symbols (composed representations), which correspond to the nodes visited by biased random walkers on a global latent graph, and infers the posterior distribution of the latter. We first demonstrate that the model is able to uncover ground-truth graphs from artificially generated datasets of random token sequences. Next, we leverage pretrained BERT and GPT-2 language models as encoder and decoder, respectively, to infer networks of symbols (schemata) from natural language datasets. Our experiments show that (i) the inferred symbols can be interpreted as encoding different aspects of language, as e.g. topics or sentiments, and that (ii) GPT-2-like models can effectively be conditioned on symbolic representations. Finally, we explore training autoregressive, random walk \u201creasoning\u201d models on schema networks inferred from commonsense knowledge databases, and using the sampled paths to enhance the performance of pretrained language models on commonsense If-Then reasoning tasks.", + "authors": [ + "Ramses Sanchez", + "Lukas Conrads", + "Pascal Welke", + "Kostadin Cvejoski", + "Cesar Ojeda Marin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.263", + "point2d": [ + 51.147037506103516, + -26.439847946166992 + ], + "cluster": 36.0 + }, + { + "idx": 265, + "title": "Towards Robust Low-Resource Fine-Tuning with Multi-View Compressed Representations", + "abstract": "Due to the huge amount of parameters, finetuning of pretrained language models (PLMs) is prone to overfitting in the low resource scenarios. In this work, we present a novel method that operates on the hidden representations of a PLM to reduce overfitting. During fine-tuning, our method inserts random autoencoders between the hidden layers of a PLM, which transform activations from the previous layers into multi-view compressed representations before feeding them into the upper layers. The autoencoders are plugged out after fine-tuning, so our method does not add extra parameters or increase computation cost during inference. 
Our method demonstrates promising performance improvements across a wide range of sequence- and token-level low-resource NLP tasks.",
+        "authors": [
+            "Linlin Liu",
+            "Xingxuan Li",
+            "Megh Thakkar",
+            "Xin Li",
+            "Shafiq Joty",
+            "Luo Si",
+            "Lidong Bing"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.264",
+        "point2d": [
+            -34.00726318359375,
+            -23.2650203704834
+        ],
+        "cluster": 8.0
+    },
+    {
+        "idx": 266,
+        "title": "An Ordinal Latent Variable Model of Conflict Intensity",
+        "abstract": "Measuring the intensity of events is crucial for monitoring and tracking armed conflict. Advances in automated event extraction have yielded massive data sets of \u201cwho did what to whom\u201d micro-records that enable data-driven approaches to monitoring conflict. The Goldstein scale is a widely-used expert-based measure that scores events on a conflictual\u2013cooperative scale. It is based only on the action category (\u201cwhat\u201d) and disregards the subject (\u201cwho\u201d) and object (\u201cto whom\u201d) of an event, as well as contextual information, like associated casualty count, that should contribute to the perception of an event\u2019s \u201cintensity\u201d. This paper takes a latent variable-based approach to measuring conflict intensity. We introduce a probabilistic generative model that assumes each observed event is associated with a latent intensity class. A novel aspect of this model is that it imposes an ordering on the classes, such that higher-valued classes denote higher levels of intensity. The ordinal nature of the latent variable is induced from naturally ordered aspects of the data (e.g., casualty counts) where higher values naturally indicate higher intensity. We evaluate the proposed model both intrinsically and extrinsically, showing that it obtains comparatively good held-out predictive performance.",
+        "authors": [
+            "Niklas Stoehr",
+            "Lucas Torroba Hennigen",
+            "Josef Valvoda",
+            "Robert West",
+            "Ryan Cotterell",
+            "Aaron Schein"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.265",
+        "point2d": [
+            39.927852630615234,
+            33.347679138183594
+        ],
+        "cluster": 19.0
+    },
+    {
+        "idx": 267,
+        "title": "Multilingual Conceptual Coverage in Text-to-Image Models",
+        "abstract": "We propose \u201cConceptual Coverage Across Languages\u201d (CoCo-CroLa), a technique for benchmarking the degree to which any generative text-to-image system provides multilingual parity to its training language in terms of tangible nouns. For each model we can assess \u201cconceptual coverage\u201d of a given target language relative to a source language by comparing the population of images generated for a series of tangible nouns in the source language to the population of images generated for each noun under translation in the target language. This technique allows us to estimate how well-suited a model is to a target language as well as identify model-specific weaknesses, spurious correlations, and biases without a priori assumptions. 
We demonstrate how it can be used to benchmark T2I models in terms of multilinguality, and how despite its simplicity it is a good proxy for impressive generalization.",
+        "authors": [
+            "Michael Saxon",
+            "William Yang Wang"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.266",
+        "point2d": [
+            -65.87401580810547,
+            42.953739166259766
+        ],
+        "cluster": 43.0
+    },
+    {
+        "idx": 268,
+        "title": "Pre-Training to Learn in Context",
+        "abstract": "In-context learning, where pre-trained language models learn to perform tasks from task examples and instructions in their contexts, has attracted much attention in the NLP community. However, the ability of in-context learning is not fully exploited because language models are not explicitly trained to learn in context. To this end, we propose PICL (Pre-training for In-Context Learning), a framework to enhance the language models\u2019 in-context learning ability by pre-training the model on a large collection of \u201cintrinsic tasks\u201d in the general plain-text corpus using the simple language modeling objective. PICL encourages the model to infer and perform tasks by conditioning on the contexts while maintaining task generalization of pre-trained models. We evaluate the in-context learning performance of the model trained with PICL on seven widely-used text classification datasets and the Super-NaturalInstructions benchmark, which contains 100+ NLP tasks formulated as text generation. Our experiments show that PICL is more effective and task-generalizable than a range of baselines, outperforming larger language models with nearly 4x parameters. The code is publicly available at https://github.com/thu-coai/PICL.",
+        "authors": [
+            "Yuxian Gu",
+            "Li Dong",
+            "Furu Wei",
+            "Minlie Huang"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.267",
+        "point2d": [
+            -15.757760047912598,
+            -20.65236473083496
+        ],
+        "cluster": 3.0
+    },
+    {
+        "idx": 269,
+        "title": "Ethical Considerations for Machine Translation of Indigenous Languages: Giving a Voice to the Speakers",
+        "abstract": "In recent years machine translation has become very successful for high-resource language pairs. This has also sparked new interest in research on the automatic translation of low-resource languages, including Indigenous languages. However, the latter are deeply related to the ethnic and cultural groups that speak (or used to speak) them. The data collection, modeling, and deployment of machine translation systems thus result in new ethical questions that must be addressed. Motivated by this, we first survey the existing literature on ethical considerations for the documentation, translation, and general natural language processing for Indigenous languages. Afterward, we conduct and analyze an interview study to shed light on the positions of community leaders, teachers, and language activists regarding ethical concerns for the automatic translation of their languages. 
Our results show that the inclusion, at different degrees, of native speakers and community members is vital to performing better and more ethical research on Indigenous languages.", + "authors": [ + "Manuel Mager", + "Elisabeth Mager", + "Katharina Kann", + "Ngoc Thang Vu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.268", + "point2d": [ + -52.441062927246094, + -2.452829360961914 + ], + "cluster": 10.0 + }, + { + "idx": 270, + "title": "Revisiting non-English Text Simplification: A Unified Multilingual Benchmark", + "abstract": "Recent advancements in high-quality, large-scale English resources have pushed the frontier of English Automatic Text Simplification (ATS) research. However, less work has been done on multilingual text simplification due to the lack of a diverse evaluation benchmark that covers complex-simple sentence pairs in many languages. This paper introduces the MultiSim benchmark, a collection of 27 resources in 12 distinct languages containing over 1.7 million complex-simple sentence pairs. This benchmark will encourage research in developing more effective multilingual text simplification models and evaluation metrics. Our experiments using MultiSim with pre-trained multilingual language models reveal exciting performance improvements from multilingual training in non-English settings. We observe strong performance from Russian in zero-shot cross-lingual transfer to low-resource languages. We further show that few-shot prompting with BLOOM-176b achieves comparable quality to reference simplifications outperforming fine-tuned models in most languages. We validate these findings through human evaluation.", + "authors": [ + "Michael Ryan", + "Tarek Naous", + "Wei Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.269", + "point2d": [ + -31.758544921875, + 28.204784393310547 + ], + "cluster": 35.0 + }, + { + "idx": 271, + "title": "Don\u2019t Generate, Discriminate: A Proposal for Grounding Language Models to Real-World Environments", + "abstract": "A key missing capacity of current language models (LMs) is grounding to real-world environments. Most existing work for grounded language understanding uses LMs to directly generate plans that can be executed in the environment to achieve the desired effects. It thereby casts the burden of ensuring grammaticality, faithfulness, and controllability all on the LMs. We propose Pangu, a generic framework for grounded language understanding that capitalizes on the discriminative ability of LMs instead of their generative ability. Pangu consists of a symbolic agent and a neural LM working in a concerted fashion: The agent explores the environment to incrementally construct valid plans, and the LM evaluates the plausibility of the candidate plans to guide the search process. 
A case study on the challenging problem of knowledge base question answering (KBQA), which features a massive environment, demonstrates the remarkable effectiveness and flexibility of Pangu: A BERT-base LM is sufficient for setting a new record on standard KBQA datasets, and larger LMs further bring substantial gains. Pangu also enables, for the first time, effective few-shot in-context learning for KBQA with large LMs such as Codex.",
+        "authors": [
+            "Yu Gu",
+            "Xiang Deng",
+            "Yu Su"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.270",
+        "point2d": [
+            51.33708953857422,
+            -9.241580963134766
+        ],
+        "cluster": 36.0
+    },
+    {
+        "idx": 272,
+        "title": "Privacy-Preserving Domain Adaptation of Semantic Parsers",
+        "abstract": "Task-oriented dialogue systems often assist users with personal or confidential matters. For this reason, the developers of such a system are generally prohibited from observing actual usage. So how can they know where the system is failing and needs more training data or new functionality? In this work, we study ways in which realistic user utterances can be generated synthetically, to help increase the linguistic and functional coverage of the system, without compromising the privacy of actual users. To this end, we propose a two-stage Differentially Private (DP) generation method which first generates latent semantic parses, and then generates utterances based on the parses. Our proposed approach improves MAUVE by 2.5X and parse tree function-type overlap by 1.3X relative to current approaches for private synthetic data generation, improving both on fluency and semantic coverage. We further validate our approach on a realistic domain adaptation task of adding new functionality from private user data to a semantic parser, and show overall gains of 8.5 percentage points in accuracy with the new feature.",
+        "authors": [
+            "Fatemehsadat Mireshghallah",
+            "Yu Su",
+            "Tatsunori Hashimoto",
+            "Jason Eisner",
+            "Richard Shin"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.271",
+        "point2d": [
+            9.89690113067627,
+            59.11174774169922
+        ],
+        "cluster": 24.0
+    },
+    {
+        "idx": 273,
+        "title": "Guide the Many-to-One Assignment: Open Information Extraction via IoU-aware Optimal Transport",
+        "abstract": "Open Information Extraction (OIE) seeks to extract structured information from raw text without the limitations of a closed ontology. Recently, the detection-based OIE methods have received great attention from the community due to their parallelism. However, as the essential step of those models, how to assign ground truth labels to the parallelly generated tuple proposals remains under-exploited. The commonly utilized Hungarian algorithm for this procedure is restricted to handling one-to-one assignment among the desired tuples and tuple proposals, which ignores the correlation between proposals and affects the recall of the models. To solve this problem, we propose a dynamic many-to-one label assignment strategy named IOT. Concretely, the label assignment process in OIE is formulated as an Optimal Transport (OT) problem. We leverage the intersection-over-union (IoU) as the assignment quality measurement, and convert the problem of finding the best assignment solution to the one of solving the optimal transport plan by maximizing the IoU values. 
To further utilize the knowledge from the assignment, we design an Assignment-guided Multi-granularity loss (AM) by simultaneously considering word-level and tuple-level information. Experimental results show that the proposed method outperforms the state-of-the-art models on three benchmarks.",
+        "authors": [
+            "Kaiwen Wei",
+            "Yiran Yang",
+            "Li Jin",
+            "Xian Sun",
+            "Zequn Zhang",
+            "Jingyuan Zhang",
+            "Xiao Li",
+            "Linhao Zhang",
+            "Jintao Liu",
+            "Guo Zhi"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.272",
+        "point2d": [
+            31.41368293762207,
+            -60.82722473144531
+        ],
+        "cluster": 38.0
+    },
+    {
+        "idx": 274,
+        "title": "Actively Supervised Clustering for Open Relation Extraction",
+        "abstract": "Current clustering-based Open Relation Extraction (OpenRE) methods usually adopt a two-stage pipeline, which simultaneously learns relation representations and assignments in the first stage, then manually labels a relation for each cluster. However, unsupervised objectives struggle to explicitly optimize clusters to align with relational semantics, and the number of clusters K has to be supplied in advance. In this paper, we present a novel setting, named actively supervised clustering for OpenRE. Our insight is that clustering learning and relation labeling can be performed simultaneously, which provides the necessary guidance for clustering without a significant increase in human effort. Along with this setting, we propose an active labeling strategy tailored for clustering. Instead of only focusing on improving the clustering of relations that have been discovered, our strategy is encouraged to discover new relations through diversity regularization. This is particularly beneficial for long-tail relations in the real world. Experimental results show that our method is able to discover almost all relational clusters in the data and improve over the SOTA methods by 13.8% and 10.6% on two datasets, respectively.",
+        "authors": [
+            "Jun Zhao",
+            "Yongxin Zhang",
+            "Qi Zhang",
+            "Tao Gui",
+            "Zhongyu Wei",
+            "Minlong Peng",
+            "Mingming Sun"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.273",
+        "point2d": [
+            38.07052230834961,
+            -65.57079315185547
+        ],
+        "cluster": 38.0
+    },
+    {
+        "idx": 275,
+        "title": "ConvGQR: Generative Query Reformulation for Conversational Search",
+        "abstract": "In conversational search, the user\u2019s real search intent for the current conversation turn is dependent on the previous conversation history. It is challenging to determine a good search query from the whole conversation context. To avoid the expensive re-training of the query encoder, most existing methods try to learn a rewriting model to de-contextualize the current query by mimicking the manual query rewriting. However, manually rewritten queries are not always the best search queries. Thus, training a rewriting model on them would lead to sub-optimal queries. Another useful source of information for enhancing the search query is the potential answer to the question. In this paper, we propose ConvGQR, a new framework to reformulate conversational queries based on generative pre-trained language models (PLMs), one for query rewriting and another for generating potential answers. By combining both, ConvGQR can produce better search queries. In addition, to relate query reformulation to the retrieval task, we propose a knowledge infusion mechanism to optimize both query reformulation and retrieval. 
Extensive experiments on four conversational search datasets demonstrate the effectiveness of ConvGQR.", + "authors": [ + "Fengran Mo", + "Kelong Mao", + "Yutao Zhu", + "Yihong Wu", + "Kaiyu Huang", + "Jian-Yun Nie" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.274", + "point2d": [ + 64.13304138183594, + 24.327930450439453 + ], + "cluster": 18.0 + }, + { + "idx": 276, + "title": "KILM: Knowledge Injection into Encoder-Decoder Language Models", + "abstract": "Large pre-trained language models (PLMs) have been shown to retain implicit knowledge within their parameters. To enhance this implicit knowledge, we propose Knowledge Injection into Language Models (KILM), a novel approach that injects entity-related knowledge into encoder-decoder PLMs, via a generative knowledge infilling objective through continued pre-training. This is done without architectural modifications to the PLMs or adding additional parameters. Experimental results over a suite of knowledge-intensive tasks spanning numerous datasets show that KILM enables models to retain more knowledge and hallucinate less while preserving their original performance on general NLU and NLG tasks. KILM also demonstrates improved zero-shot performances on tasks such as entity disambiguation, outperforming state-of-the-art models having 30x more parameters.", + "authors": [ + "Yan Xu", + "Mahdi Namazifar", + "Devamanyu Hazarika", + "Aishwarya Padmakumar", + "Yang Liu", + "Dilek Hakkani-Tur" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.275", + "point2d": [ + -23.584749221801758, + -27.59795570373535 + ], + "cluster": 20.0 + }, + { + "idx": 277, + "title": "VSTAR: A Video-grounded Dialogue Dataset for Situated Semantic Understanding with Scene and Topic Transitions", + "abstract": "Video-grounded dialogue understanding is a challenging problem that requires machine to perceive, parse and reason over situated semantics extracted from weakly aligned video and dialogues. Most existing benchmarks treat both modalities the same as a frame-independent visual understanding task, while neglecting the intrinsic attributes in multimodal dialogues, such as scene and topic transitions. In this paper, we present Video-grounded Scene&Topic AwaRe dialogue (VSTAR) dataset, a large scale video-grounded dialogue understanding dataset based on 395 TV series. Based on VSTAR, we propose two benchmarks for video-grounded dialogue understanding: scene segmentation and topic segmentation, and one benchmark for video-grounded dialogue generation. Comprehensive experiments are performed on these benchmarks to demonstrate the importance of multimodal information and segments in video-grounded dialogue understanding and generation.", + "authors": [ + "Yuxuan Wang", + "Zilong Zheng", + "Xueliang Zhao", + "Jinpeng Li", + "Yueqian Wang", + "Dongyan Zhao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.276", + "point2d": [ + 8.158705711364746, + 76.62556457519531 + ], + "cluster": 49.0 + }, + { + "idx": 278, + "title": "NLPeer: A Unified Resource for the Computational Study of Peer Review", + "abstract": "Peer review constitutes a core component of scholarly publishing; yet it demands substantial expertise and training, and is susceptible to errors and biases. 
Various applications of NLP for peer reviewing assistance aim to support reviewers in this complex process, but the lack of clearly licensed datasets and multi-domain corpora prevents the systematic study of NLP for peer review. To remedy this, we introduce NLPeer \u2013 the first ethically sourced multi-domain corpus of more than 5k papers and 11k review reports from five different venues. In addition to the new datasets of paper drafts, camera-ready versions and peer reviews from the NLP community, we establish a unified data representation and augment previous peer review datasets to include parsed and structured paper representations, rich metadata and versioning information. We complement our resource with implementations and analysis of three reviewing assistance tasks, including a novel guided skimming task. Our work paves the way towards a systematic, multi-faceted, evidence-based study of peer review in NLP and beyond. The data and code are publicly available.",
+        "authors": [
+            "Nils Dycke",
+            "Ilia Kuznetsov",
+            "Iryna Gurevych"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.277",
+        "point2d": [
+            18.777053833007812,
+            17.435455322265625
+        ],
+        "cluster": 40.0
+    },
+    {
+        "idx": 279,
+        "title": "IM-TQA: A Chinese Table Question Answering Dataset with Implicit and Multi-type Table Structures",
+        "abstract": "Various datasets have been proposed to promote the development of Table Question Answering (TQA) techniques. However, the problem setting of existing TQA benchmarks suffers from two limitations. First, they directly provide models with explicit table structures where row headers and column headers of the table are explicitly annotated and treated as model input during inference. Second, they only consider tables of limited types and ignore other tables, especially complex tables with flexible header locations. Such a simplified problem setting cannot cover practical scenarios where models need to process tables without header annotations in the inference phase or tables of different types. To address the above issues, we construct a new TQA dataset with implicit and multi-type table structures, named IM-TQA, which not only requires the model to understand tables without directly available header annotations but also to handle multi-type tables including previously neglected complex tables. We investigate the performance of recent methods on our dataset and find that existing methods struggle in processing implicit and multi-type table structures. Correspondingly, we propose an RGCN-RCI framework outperforming recent baselines. We will release our dataset to facilitate future research.",
+        "authors": [
+            "Mingyu Zheng",
+            "Yang Hao",
+            "Wenbin Jiang",
+            "Zheng Lin",
+            "Yajuan Lyu",
+            "QiaoQiao She",
+            "Weiping Wang"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.278",
+        "point2d": [
+            76.91822052001953,
+            5.619078159332275
+        ],
+        "cluster": 5.0
+    },
+    {
+        "idx": 280,
+        "title": "Z-Code++: A Pre-trained Language Model Optimized for Abstractive Summarization",
+        "abstract": "This paper presents Z-Code++, a new pre-trained language model optimized for abstractive text summarization. The model extends the state-of-the-art encoder-decoder model using three techniques. First, we use a two-phase pre-training to improve the model\u2019s performance on low-resource summarization tasks. 
The model is first pre-trained using text corpora for language understanding, and is then continually pre-trained on summarization corpora for grounded text generation. Second, we replace self-attention layers in the encoder with disentangled attention layers, where each word is represented using two vectors that encode its content and position, respectively. Third, we use fusion-in-encoder, a simple yet effective method of encoding long sequences in a hierarchical manner. Z-Code++ creates a new state-of-the-art on 9 of 13 text summarization tasks across 5 languages. Our model is parameter-efficient in that it outperforms the 600x larger PaLM 540B on XSum, and the finetuned 200x larger GPT3 175B on SAMSum. In zero-shot and few-shot settings, our model substantially outperforms the competing models.",
+        "authors": [
+            "Pengcheng He",
+            "Baolin Peng",
+            "Song Wang",
+            "Yang Liu",
+            "Ruochen Xu",
+            "Hany Hassan",
+            "Yu Shi",
+            "Chenguang Zhu",
+            "Wayne Xiong",
+            "Michael Zeng",
+            "Jianfeng Gao",
+            "Xuedong Huang"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.279",
+        "point2d": [
+            -13.447738647460938,
+            39.888641357421875
+        ],
+        "cluster": 7.0
+    },
+    {
+        "idx": 281,
+        "title": "Mixture-of-Domain-Adapters: Decoupling and Injecting Domain Knowledge to Pre-trained Language Models\u2019 Memories",
+        "abstract": "Pre-trained language models (PLMs) demonstrate excellent abilities to understand texts in the generic domain while struggling in a specific domain. Although continued pre-training on a large domain-specific corpus is effective, it is costly to tune all the parameters on the domain. In this paper, we investigate whether we can adapt PLMs both effectively and efficiently by only tuning a few parameters. Specifically, we decouple the feed-forward networks (FFNs) of the Transformer architecture into two parts: the original pre-trained FFNs to maintain the old-domain knowledge and our novel domain-specific adapters to inject domain-specific knowledge in parallel. Then we adopt a mixture-of-adapters gate to fuse the knowledge from different domain adapters dynamically. Our proposed Mixture-of-Domain-Adapters (MixDA) employs a two-stage adapter-tuning strategy that leverages both unlabeled data and labeled data to help the domain adaptation: i) domain-specific adapter on unlabeled data; followed by ii) the task-specific adapter on labeled data. MixDA can be seamlessly plugged into the pretraining-finetuning paradigm and our experiments demonstrate that MixDA achieves superior performance on in-domain tasks (GLUE), out-of-domain tasks (ChemProt, RCT, IMDB, Amazon), and knowledge-intensive tasks (KILT). Further analyses demonstrate the reliability, scalability, and efficiency of our method.",
+        "authors": [
+            "Shizhe Diao",
+            "Tianyang Xu",
+            "Ruijia Xu",
+            "Jiawei Wang",
+            "Tong Zhang"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.280",
+        "point2d": [
+            -25.035709381103516,
+            -26.499359130859375
+        ],
+        "cluster": 8.0
+    },
+    {
+        "idx": 282,
+        "title": "Unsupervised Graph-Text Mutual Conversion with a Unified Pretrained Language Model",
+        "abstract": "Graph-to-text (G2T) generation and text-to-graph (T2G) triple extraction are two essential tasks for knowledge graphs. Existing unsupervised approaches become suitable candidates for jointly learning the two tasks due to their avoidance of using graph-text parallel data. 
However, they adopt multiple complex modules and still require entity information or relation type for training. To this end, we propose INFINITY, a simple yet effective unsupervised method with a unified pretrained language model that does not introduce external annotation tools or additional parallel information. It achieves fully unsupervised graph-text mutual conversion for the first time. Specifically, INFINITY treats both G2T and T2G as a bidirectional sequence generation task by fine-tuning only one pretrained seq2seq model. A novel back-translation-based framework is then designed to generate synthetic parallel data automatically. Besides, we investigate the impact of graph linearization and introduce the structure-aware fine-tuning strategy to alleviate possible performance deterioration via retaining structural information in graph sequences. As a fully unsupervised framework, INFINITY is empirically verified to outperform state-of-the-art baselines for G2T and T2G tasks. Additionally, we also devise a new training setting called cross learning for low-resource unsupervised information extraction.", + "authors": [ + "Yi Xu", + "Shuqian Sheng", + "Jiexing Qi", + "Luoyi Fu", + "Zhouhan Lin", + "Xinbing Wang", + "Chenghu Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.281", + "point2d": [ + 43.607200622558594, + -68.20931243896484 + ], + "cluster": 25.0 + }, + { + "idx": 283, + "title": "Randomized Smoothing with Masked Inference for Adversarially Robust Text Classifications", + "abstract": "Large-scale pre-trained language models have shown outstanding performance in a variety of NLP tasks. However, they are also known to be significantly brittle against specifically crafted adversarial examples, leading to increasing interest in probing the adversarial robustness of NLP systems. We introduce RSMI, a novel two-stage framework that combines randomized smoothing (RS) with masked inference (MI) to improve the adversarial robustness of NLP systems. RS transforms a classifier into a smoothed classifier to obtain robust representations, whereas MI forces a model to exploit the surrounding context of a masked token in an input sequence. RSMI improves adversarial robustness by 2 to 3 times over existing state-of-the-art methods on benchmark datasets. We also perform in-depth qualitative analysis to validate the effectiveness of the different stages of RSMI and probe the impact of its components through extensive ablations. By empirically proving the stability of RSMI, we put it forward as a practical method to robustly train large-scale NLP models. Our code and datasets are available at https://github.com/Han8931/rsmi_nlp", + "authors": [ + "Han Cheol Moon", + "Shafiq Joty", + "Ruochen Zhao", + "Megh Thakkar", + "Chi Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.282", + "point2d": [ + 2.4339993000030518, + 6.608603477478027 + ], + "cluster": 48.0 + }, + { + "idx": 284, + "title": "SESCORE2: Learning Text Generation Evaluation via Synthesizing Realistic Mistakes", + "abstract": "Is it possible to train a general metric for evaluating text generation quality without human-annotated ratings? Existing learned metrics either perform unsatisfactory across text generation tasks or require human ratings for training on specific tasks. In this paper, we propose SEScore2, a self-supervised approach for training a model-based metric for text generation evaluation. 
The key concept is to synthesize realistic model mistakes by perturbing sentences retrieved from a corpus. We evaluate SEScore2 and previous methods on four text generation tasks across three languages. SEScore2 outperforms all prior unsupervised metrics on four text generation evaluation benchmarks, with an average Kendall improvement of 0.158. Surprisingly, SEScore2 even outperforms the supervised BLEURT and COMET on multiple text generation tasks.", + "authors": [ + "Wenda Xu", + "Xian Qian", + "Mingxuan Wang", + "Lei Li", + "William Yang Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.283", + "point2d": [ + -19.079069137573242, + 15.542612075805664 + ], + "cluster": 4.0 + }, + { + "idx": 285, + "title": "Tokenization and the Noiseless Channel", + "abstract": "Subword tokenization is a key part of most NLP pipelines.However, little is known about why some tokenizer and hyperparameter combinations lead to improved downstream model performance over others. We propose that good tokenizers lead to efficient channel usage, where the channel is the means by which some input is conveyed to the model and efficiency can be quantified in information-theoretic terms as the ratio of the Shannon entropy to the maximum entropy of the subword distribution.Nevertheless, an optimal encoding according to Shannon entropy assigns extremely long codes to low-frequency subwords and very short codes to high-frequency subwords.Defining efficiency in terms of R\u00e9nyi entropy, on the other hand, penalizes distributions with either very high or very low-frequency subwords.We posit that (1) extremely high-frequency subwords are problematic because their meaning is not distinct and (2) that low-frequency subwords may not appear frequently enough for their meaning to be learned properly; encodings that induce unigram distributions with either can harm model performance.In machine translation, we find that across multiple tokenizers, the R\u00e9nyi entropy has a very strong correlation with BLEU: 0.82 in comparison to just -0.30 for compressed length.", + "authors": [ + "Vil\u00e9m Zouhar", + "Clara Meister", + "Juan Gastaldi", + "Li Du", + "Mrinmaya Sachan", + "Ryan Cotterell" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.284", + "point2d": [ + -39.71098709106445, + -4.542628288269043 + ], + "cluster": 6.0 + }, + { + "idx": 286, + "title": "Contextual Distortion Reveals Constituency: Masked Language Models are Implicit Parsers", + "abstract": "Recent advancements in pre-trained language models (PLMs) have demonstrated that these models possess some degree of syntactic awareness. To leverage this knowledge, we propose a novel chart-based method for extracting parse trees from masked language models (LMs) without the need to train separate parsers. Our method computes a score for each span based on the distortion of contextual representations resulting from linguistic perturbations. We design a set of perturbations motivated by the linguistic concept of constituency tests, and use these to score each span by aggregating the distortion scores. To produce a parse tree, we use chart parsing to find the tree with the minimum score. Our method consistently outperforms previous state-of-the-art methods on English with masked LMs, and also demonstrates superior performance in a multilingual setting, outperforming the state-of-the-art in 6 out of 8 languages. 
Notably, although our method does not involve parameter updates or extensive hyperparameter search, its performance can even surpass some unsupervised parsing methods that require fine-tuning. Our analysis highlights that the distortion of contextual representation resulting from syntactic perturbation can serve as an effective indicator of constituency across languages.", + "authors": [ + "Jiaxi Li", + "Wei Lu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.285", + "point2d": [ + -23.348247528076172, + -63.094635009765625 + ], + "cluster": 41.0 + }, + { + "idx": 287, + "title": "MetaAdapt: Domain Adaptive Few-Shot Misinformation Detection via Meta Learning", + "abstract": "With emerging topics (e.g., COVID-19) on social media as a source for the spreading misinformation, overcoming the distributional shifts between the original training domain (i.e., source domain) and such target domains remains a non-trivial task for misinformation detection. This presents an elusive challenge for early-stage misinformation detection, where a good amount of data and annotations from the target domain is not available for training. To address the data scarcity issue, we propose MetaAdapt, a meta learning based approach for domain adaptive few-shot misinformation detection. MetaAdapt leverages limited target examples to provide feedback and guide the knowledge transfer from the source to the target domain (i.e., learn to adapt). In particular, we train the initial model with multiple source tasks and compute their similarity scores to the meta task. Based on the similarity scores, we rescale the meta gradients to adaptively learn from the source tasks. As such, MetaAdapt can learn how to adapt the misinformation detection model and exploit the source data for improved performance in the target domain. To demonstrate the efficiency and effectiveness of our method, we perform extensive experiments to compare MetaAdapt with state-of-the-art baselines and large language models (LLMs) such as LLaMA, where MetaAdapt achieves better performance in domain adaptive few-shot misinformation detection with substantially reduced parameters on real-world datasets.", + "authors": [ + "Zhenrui Yue", + "Huimin Zeng", + "Yang Zhang", + "Lanyu Shang", + "Dong Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.286", + "point2d": [ + -5.691811561584473, + -6.872772216796875 + ], + "cluster": 17.0 + }, + { + "idx": 288, + "title": "Tackling Modality Heterogeneity with Multi-View Calibration Network for Multimodal Sentiment Detection", + "abstract": "With the popularity of social media, detecting sentiment from multimodal posts (e.g. image-text pairs) has attracted substantial attention recently. Existing works mainly focus on fusing different features but ignore the challenge of modality heterogeneity. Specifically, different modalities with inherent disparities may bring three problems: 1) introducing redundant visual features during feature fusion; 2) causing feature shift in the representation space; 3) leading to inconsistent annotations for different modal data. All these issues will increase the difficulty in understanding the sentiment of the multimodal content. In this paper, we propose a novel Multi-View Calibration Network (MVCN) to alleviate the above issues systematically. 
We first propose a text-guided fusion module with novel Sparse-Attention to reduce the negative impacts of redundant visual elements. We then devise a sentiment-based congruity constraint task to calibrate the feature shift in the representation space. Finally, we introduce an adaptive loss calibration strategy to tackle inconsistent annotated labels. Extensive experiments demonstrate the competitiveness of MVCN against previous approaches and achieve state-of-the-art results on two public benchmark datasets.", + "authors": [ + "Yiwei Wei", + "Shaozu Yuan", + "Ruosong Yang", + "Lei Shen", + "Zhangmeizhi Li", + "Longbiao Wang", + "Meng Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.287", + "point2d": [ + -40.76618194580078, + 58.44348907470703 + ], + "cluster": 16.0 + }, + { + "idx": 289, + "title": "COLA: Contextualized Commonsense Causal Reasoning from the Causal Inference Perspective", + "abstract": "Detecting commonsense causal relations (causation) between events has long been an essential yet challenging task. Given that events are complicated, an event may have different causes under various contexts. Thus, exploiting context plays an essential role in detecting causal relations. Meanwhile, previous works about commonsense causation only consider two events and ignore their context, simplifying the task formulation. This paper proposes a new task to detect commonsense causation between two events in an event sequence (i.e., context), called contextualized commonsense causal reasoning. We also design a zero-shot framework: COLA (Contextualized Commonsense Causality Reasoner) to solve the task from the causal inference perspective. This framework obtains rich incidental supervision from temporality and balances covariates from multiple timestamps to remove confounding effects. Our extensive experiments show that COLA can detect commonsense causality more accurately than baselines.", + "authors": [ + "Zhaowei Wang", + "Quyet V. Do", + "Hongming Zhang", + "Jiayao Zhang", + "Weiqi Wang", + "Tianqing Fang", + "Yangqiu Song", + "Ginny Wong", + "Simon See" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.288", + "point2d": [ + 52.11423873901367, + -46.417423248291016 + ], + "cluster": 22.0 + }, + { + "idx": 290, + "title": "MEMEX: Detecting Explanatory Evidence for Memes via Knowledge-Enriched Contextualization", + "abstract": "Memes are a powerful tool for communication over social media. Their affinity for evolving across politics, history, and sociocultural phenomena renders them an ideal vehicle for communication. To comprehend the subtle message conveyed within a meme, one must understand the relevant background that facilitates its holistic assimilation. Besides digital archiving of memes and their metadata by a few websites like knowyourmeme.com, currently, there is no efficient way to deduce a meme\u2019s context dynamically. In this work, we propose a novel task, MEMEX - given a meme and a related document, the aim is to mine the context that succinctly explains the background of the meme. At first, we develop MCC (Meme Context Corpus), a novel dataset for MEMEX. Further, to benchmark MCC, we propose MIME (MultImodal Meme Explainer), a multimodal neural framework that uses external knowledge-enriched meme representation and a multi-level approach to capture the cross-modal semantic dependencies between the meme and the context. 
MIME surpasses several unimodal and multimodal systems and yields an absolute improvement of 4% F1-score over the best baseline. Lastly, we conduct detailed analyses of MIME\u2019s performance, highlighting the aspects that could lead to optimal modeling of cross-modal contextual associations.", + "authors": [ + "Shivam Sharma", + "Ramaneswaran S", + "Udit Arora", + "Md. Shad Akhtar", + "Tanmoy Chakraborty" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.289", + "point2d": [ + 10.560933113098145, + 52.5001106262207 + ], + "cluster": 19.0 + }, + { + "idx": 291, + "title": "WikiHowQA: A Comprehensive Benchmark for Multi-Document Non-Factoid Question Answering", + "abstract": "Answering non-factoid questions (NFQA) is a challenging task, requiring passage-level answers that are difficult to construct and evaluate. Search engines may provide a summary of a single web page, but many questions require reasoning across multiple documents. Meanwhile, modern models can generate highly coherent and fluent, but often factually incorrect answers that can deceive even non-expert humans. There is a critical need for high-quality resources for multi-document NFQA (MD-NFQA) to train new models and evaluate answers\u2019 grounding and factual consistency in relation to supporting documents.To address this gap, we introduce WikiHowQA, a new multi-document NFQA benchmark built on WikiHow, a website dedicated to answering \u201chow-to\u201d questions. The benchmark includes 11,746 human-written answers along with 74,527 supporting documents. We describe the unique challenges of the resource, provide strong baselines, and propose a novel human evaluation framework that utilizes highlighted relevant supporting passages to mitigate issues such as assessor unfamiliarity with the question topic. All code and data, including the automatic code for preparing the human evaluation, are publicly available.", + "authors": [ + "Valeriia Bolotova-Baranova", + "Vladislav Blinov", + "Sofya Filippova", + "Falk Scholer", + "Mark Sanderson" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.290", + "point2d": [ + 68.61624145507812, + 9.14599895477295 + ], + "cluster": 5.0 + }, + { + "idx": 292, + "title": "Making Language Models Better Reasoners with Step-Aware Verifier", + "abstract": "Few-shot learning is a challenging task that requires language models to generalize from limited examples. Large language models like GPT-3 and PaLM have made impressive progress in this area, but they still face difficulties in reasoning tasks such as GSM8K, a benchmark for arithmetic problems. To improve their reasoning skills, previous work has proposed to guide the language model with prompts that elicit a series of reasoning steps before giving the final answer, achieving a significant improvement on GSM8K from 17.9% to 58.1% in problem-solving rate. In this paper, we present DiVeRSe (Diverse Verifier on Reasoning Step), a novel approach that further enhances the reasoning capability of language models. DiVeRSe has three main components: first, it generates diverse prompts to explore different reasoning paths for the same question; second, it uses a verifier to filter out incorrect answers based on a weighted voting scheme; and third, it verifies each reasoning step individually instead of the whole chain. 
We evaluate DiVeRSe on the latest language model code-davinci-002 and show that it achieves new state-of-the-art results on six of eight reasoning benchmarks (e.g., GSM8K 74.4% to 83.2%).", + "authors": [ + "Yifei Li", + "Zeqi Lin", + "Shizhuo Zhang", + "Qiang Fu", + "Bei Chen", + "Jian-Guang Lou", + "Weizhu Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.291", + "point2d": [ + 46.20414352416992, + -16.209062576293945 + ], + "cluster": 12.0 + }, + { + "idx": 293, + "title": "Distributed Marker Representation for Ambiguous Discourse Markers and Entangled Relations", + "abstract": "Discourse analysis is an important task because it models intrinsic semantic structures between sentences in a document. Discourse markers are natural representations of discourse in our daily language. One challenge is that the markers as well as pre-defined and human-labeled discourse relations can be ambiguous when describing the semantics between sentences. We believe that a better approach is to use a contextual-dependent distribution over the markers to express discourse information. In this work, we propose to learn a Distributed Marker Representation (DMR) by utilizing the (potentially) unlimited discourse marker data with a latent discourse sense, thereby bridging markers with sentence pairs. Such representations can be learned automatically from data without supervision, and in turn provide insights into the data itself. Experiments show the SOTA performance of our DMR on the implicit discourse relation recognition task and strong interpretability. Our method also offers a valuable tool to understand complex ambiguity and entanglement among discourse markers and manually defined discourse relations.", + "authors": [ + "Dongyu Ru", + "Lin Qiu", + "Xipeng Qiu", + "Yue Zhang", + "Zheng Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.292", + "point2d": [ + 15.96812915802002, + -51.89329528808594 + ], + "cluster": 19.0 + }, + { + "idx": 294, + "title": "MISGENDERED: Limits of Large Language Models in Understanding Pronouns", + "abstract": "Content Warning: This paper contains examples of misgendering and erasure that could be offensive and potentially triggering.Gender bias in language technologies has been widely studied, but research has mostly been restricted to a binary paradigm of gender. It is essential also to consider non-binary gender identities, as excluding them can cause further harm to an already marginalized group. In this paper, we comprehensively evaluate popular language models for their ability to correctly use English gender-neutral pronouns (e.g., singular they, them) and neo-pronouns (e.g., ze, xe, thon) that are used by individuals whose gender identity is not represented by binary pronouns. We introduce Misgendered, a framework for evaluating large language models\u2019 ability to correctly use preferred pronouns, consisting of (i) instances declaring an individual\u2019s pronoun, followed by a sentence with a missing pronoun, and (ii) an experimental setup for evaluating masked and auto-regressive language models using a unified method. When prompted out-of-the-box, language models perform poorly at correctly predicting neo-pronouns (averaging 7.6% accuracy) and gender-neutral pronouns (averaging 31.0% accuracy). This inability to generalize results from a lack of representation of non-binary pronouns in training data and memorized associations. 
Few-shot adaptation with explicit examples in the prompt improves the performance but plateaus at only 45.4% for neo-pronouns. We release the full dataset, code, and demo at https://tamannahossainkay.github.io/misgendered/.", + "authors": [ + "Tamanna Hossain", + "Sunipa Dev", + "Sameer Singh" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.293", + "point2d": [ + 11.35046100616455, + 30.334741592407227 + ], + "cluster": 10.0 + }, + { + "idx": 295, + "title": "Reasoning with Language Model Prompting: A Survey", + "abstract": "Reasoning, as an essential ability for complex problem-solving, can provide back-end support for various real-world applications, such as medical diagnosis, negotiation, etc. This paper provides a comprehensive survey of cutting-edge research on reasoning with language model prompting. We introduce research works with comparisons and summaries and provide systematic resources to help beginners. We also discuss the potential reasons for emerging such reasoning abilities and highlight future research directions. Resources are available at https://github.com/zjunlp/Prompt4ReasoningPapers (updated periodically).", + "authors": [ + "Shuofei Qiao", + "Yixin Ou", + "Ningyu Zhang", + "Xiang Chen", + "Yunzhi Yao", + "Shumin Deng", + "Chuanqi Tan", + "Fei Huang", + "Huajun Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.294", + "point2d": [ + 47.73463439941406, + -10.830438613891602 + ], + "cluster": 36.0 + }, + { + "idx": 296, + "title": "Tackling Ambiguity with Images: Improved Multimodal Machine Translation and Contrastive Evaluation", + "abstract": "One of the major challenges of machine translation (MT) is ambiguity, which can in some cases be resolved by accompanying context such as images. However, recent work in multimodal MT (MMT) has shown that obtaining improvements from images is challenging, limited not only by the difficulty of building effective cross-modal representations, but also by the lack of specific evaluation and training data. We present a new MMT approach based on a strong text-only MT model, which uses neural adapters, a novel guided self-attention mechanism and which is jointly trained on both visually-conditioned masking and MMT. We also introduce CoMMuTE, a Contrastive Multilingual Multimodal Translation Evaluation set of ambiguous sentences and their possible translations, accompanied by disambiguating images corresponding to each translation. Our approach obtains competitive results compared to strong text-only models on standard English\u2192French, English\u2192German and English\u2192Czech benchmarks and outperforms baselines and state-of-the-art MMT systems by a large margin on our contrastive test set. Our code and CoMMuTE are freely available.", + "authors": [ + "Matthieu Futeral", + "Cordelia Schmid", + "Ivan Laptev", + "Beno\u00eet Sagot", + "Rachel Bawden" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.295", + "point2d": [ + -67.59881591796875, + 38.93305969238281 + ], + "cluster": 1.0 + }, + { + "idx": 297, + "title": "Hybrid Knowledge Transfer for Improved Cross-Lingual Event Detection via Hierarchical Sample Selection", + "abstract": "In this paper, we address the Event Detection task under a zero-shot cross-lingual setting where a model is trained on a source language but evaluated on a distinct target language for which there is no labeled data available. 
Most recent efforts in this field follow a direct transfer approach in which the model is trained using language-invariant features and then directly applied to the target language. However, we argue that these methods fail to take advantage of the benefits of the data transfer approach where a cross-lingual model is trained on target-language data and is able to learn task-specific information from syntactical features or word-label relations in the target language. As such, we propose a hybrid knowledge-transfer approach that leverages a teacher-student framework where the teacher and student networks are trained following the direct and data transfer approaches, respectively. Our method is complemented by a hierarchical training-sample selection scheme designed to address the issue of noisy labels being generated by the teacher model. Our model achieves state-of-the-art results on 9 morphologically-diverse target languages across 3 distinct datasets, highlighting the importance of exploiting the benefits of hybrid transfer.", + "authors": [ + "Luis Guzman Nateras", + "Franck Dernoncourt", + "Thien Nguyen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.296", + "point2d": [ + 50.60573959350586, + -35.770328521728516 + ], + "cluster": 46.0 + }, + { + "idx": 298, + "title": "BLEURT Has Universal Translations: An Analysis of Automatic Metrics by Minimum Risk Training", + "abstract": "Automatic metrics play a crucial role in machine translation. Despite the widespread use of n-gram-based metrics, there has been a recent surge in the development of pre-trained model-based metrics that focus on measuring sentence semantics. However, these neural metrics, while achieving higher correlations with human evaluations, are often considered to be black boxes with potential biases that are difficult to detect. In this study, we systematically analyze and compare various mainstream and cutting-edge automatic metrics from the perspective of their guidance for training machine translation systems. Through Minimum Risk Training (MRT), we find that certain metrics exhibit robustness defects, such as the presence of universal adversarial translations in BLEURT and BARTScore. In-depth analysis suggests two main causes of these robustness deficits: distribution biases in the training datasets, and the tendency of the metric paradigm. By incorporating token-level constraints, we enhance the robustness of evaluation metrics, which in turn leads to an improvement in the performance of machine translation systems. Codes are available at https://github.com/powerpuffpomelo/fairseq_mrt.", + "authors": [ + "Yiming Yan", + "Tao Wang", + "Chengqi Zhao", + "Shujian Huang", + "Jiajun Chen", + "Mingxuan Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.297", + "point2d": [ + -74.08268737792969, + -4.583806991577148 + ], + "cluster": 1.0 + }, + { + "idx": 299, + "title": "Cross-modal Attention Congruence Regularization for Vision-Language Relation Alignment", + "abstract": "Despite recent progress towards scaling up multimodal vision-language models, these models are still known to struggle on compositional generalization benchmarks such as Winoground. 
We find that a critical component lacking from current vision-language models is relation-level alignment: the ability to match directional semantic relations in text (e.g., \u2018mug in grass\u2019) with spatial relationships in the image (e.g., the position of the mug relative to the grass). To tackle this problem, we show that relation alignment can be enforced by encouraging the language attention from \u2018mug\u2019 to \u2018grass\u2019 (capturing the semantic relation \u2018in\u2019) to match the visual attention from the mug to the grass (capturing the corresponding physical relation). Tokens and their corresponding objects are softly identified using a weighted mean of cross-modal attention. We prove that this notion of soft cross-modal equivalence is equivalent to enforcing congruence between vision and language attention matrices under a \u2018change of basis\u2019 provided by the cross-modal attention matrix. Intuitively, our approach projects visual attention into the language attention space to calculate its divergence from the actual language attention, and vice versa. We apply our Cross-modal Attention Congruence Regularization (CACR) loss to fine-tune UNITER and improve its Winoground Group score by 5.75 points.", + "authors": [ + "Rohan Pandey", + "Rulin Shao", + "Paul Pu Liang", + "Ruslan Salakhutdinov", + "Louis-Philippe Morency" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.298", + "point2d": [ + -57.57532501220703, + 32.50359344482422 + ], + "cluster": 26.0 + }, + { + "idx": 300, + "title": "Enhancing Personalized Dialogue Generation with Contrastive Latent Variables: Combining Sparse and Dense Persona", + "abstract": "The personalized dialogue explores the consistent relationship between dialogue generation and personality. Existing personalized dialogue agents model persona profiles from three resources: sparse or dense persona descriptions and dialogue histories. However, sparse structured persona attributes are explicit but uninformative, dense persona texts contain rich persona descriptions with much noise, and dialogue history query is both noisy and uninformative for persona modeling. In this work, we combine the advantages of the three resources to obtain a richer and more accurate persona. We design a Contrastive Latent Variable-based model (CLV) that clusters the dense persona descriptions into sparse categories, which are combined with the history query to generate personalized responses. Experimental results on Chinese and English datasets demonstrate our model\u2019s superiority in personalization.", + "authors": [ + "Yihong Tang", + "Bo Wang", + "Miao Fang", + "Dongming Zhao", + "Kun Huang", + "Ruifang He", + "Yuexian Hou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.299", + "point2d": [ + 12.477426528930664, + 73.17912292480469 + ], + "cluster": 49.0 + }, + { + "idx": 301, + "title": "Can LMs Learn New Entities from Descriptions? Challenges in Propagating Injected Knowledge", + "abstract": "Pre-trained language models (LMs) are used for knowledge intensive tasks like question answering, but their knowledge gets continuously outdated as the world changes. Prior work has studied targeted updates to LMs, injecting individual facts and evaluating whether the model learns these facts while not changing predictions on other contexts. 
We take a step forward and study LMs\u2019 abilities to make inferences based on injected facts (or propagate those facts): for example, after learning that something is a TV show, does an LM predict that you can watch it? We study this with two cloze-style tasks: an existing dataset of real-world sentences about novel entities (ECBD) as well as a new controlled benchmark with manually designed templates requiring varying levels of inference about injected knowledge. Surprisingly, we find that existing methods for updating knowledge (gradient-based fine-tuning and modifications of this approach) show little propagation of injected knowledge. These methods improve performance on cloze instances only when there is lexical overlap between injected facts and target inferences. Yet, prepending entity definitions in an LM\u2019s context improves performance across all settings, suggesting that there is substantial headroom for parameter-updating approaches for knowledge injection.", + "authors": [ + "Yasumasa Onoe", + "Michael Zhang", + "Shankar Padmanabhan", + "Greg Durrett", + "Eunsol Choi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.300", + "point2d": [ + 45.426124572753906, + 1.9029933214187622 + ], + "cluster": 36.0 + }, + { + "idx": 302, + "title": "Explaining How Transformers Use Context to Build Predictions", + "abstract": "Language Generation Models produce words based on the previous context. Although existing methods offer input attributions as explanations for a model\u2019s prediction, it is still unclear how prior words affect the model\u2019s decision throughout the layers. In this work, we leverage recent advances in explainability of the Transformer and present a procedure to analyze models for language generation. Using contrastive examples, we compare the alignment of our explanations with evidence of the linguistic phenomena, and show that our method consistently aligns better than gradient-based and perturbation-based baselines. Then, we investigate the role of MLPs inside the Transformer and show that they learn features that help the model predict words that are grammatically acceptable. Lastly, we apply our method to Neural Machine Translation models, and demonstrate that they generate human-like source-target alignments for building predictions.", + "authors": [ + "Javier Ferrando", + "Gerard I. G\u00e1llego", + "Ioannis Tsiamas", + "Marta R. Costa-juss\u00e0" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.301", + "point2d": [ + -25.06196403503418, + 13.112283706665039 + ], + "cluster": 6.0 + }, + { + "idx": 303, + "title": "DISCO: Distilling Counterfactuals with Large Language Models", + "abstract": "Models trained with counterfactually augmented data learn representations of the causal structure of tasks, enabling robust generalization. However, high-quality counterfactual data is scarce for most tasks and not easily generated at scale. When crowdsourced, such data is typically limited in scale and diversity; when generated using supervised methods, it is computationally expensive to extend to new counterfactual dimensions. In this work, we introduce DISCO (DIStilled COunterfactual Data), a new method for automatically generating high-quality counterfactual data at scale. DISCO engineers prompts to generate phrasal perturbations with a large general language model. 
Then, a task-specific teacher model filters these generations to distill high-quality counterfactual data. While task-agnostic, we apply our pipeline to the task of natural language inference (NLI) and find that on challenging evaluations such as the NLI stress test, comparatively smaller student models trained with DISCO generated counterfactuals are more robust (6% absolute) and generalize better across distributions (2%) compared to models trained without data augmentation. Furthermore, DISCO augmented models are 10% more consistent between counterfactual pairs on three evaluation sets, demonstrating that DISCO augmentation enables models to more reliably learn causal representations. Our repository is available at: https://github.com/eric11eca/disco", + "authors": [ + "Zeming Chen", + "Qiyue Gao", + "Antoine Bosselut", + "Ashish Sabharwal", + "Kyle Richardson" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.302", + "point2d": [ + 30.011009216308594, + -2.638862371444702 + ], + "cluster": 4.0 + }, + { + "idx": 304, + "title": "Non-Sequential Graph Script Induction via Multimedia Grounding", + "abstract": "Online resources such as WikiHow compile a wide range of scripts for performing everyday tasks, which can assist models in learning to reason about procedures. However, the scripts are always presented in a linear manner, which does not reflect the flexibility displayed by people executing tasks in real life. For example, in the CrossTask Dataset, 64.5% of consecutive step pairs are also observed in the reverse order, suggesting their ordering is not fixed. In addition, each step has an average of 2.56 frequent next steps, demonstrating \u201cbranching\u201d. In this paper, we propose the new challenging task of non-sequential graph script induction, aiming to capture optional and interchangeable steps in procedural planning. To automate the induction of such graph scripts for given tasks, we propose to take advantage of loosely aligned videos of people performing the tasks. In particular, we design a multimodal framework to ground procedural videos to WikiHow textual steps and thus transform each video into an observed step path on the latent ground truth graph script. This key transformation enables us to train a script knowledge model capable of both generating explicit graph scripts for learnt tasks and predicting future steps given a partial step sequence. Our best model outperforms the strongest pure text/vision baselines by 17.52% absolute gains on F1@3 for next step prediction and 13.8% absolute gains on Acc@1 for partial sequence completion. Human evaluation shows our model outperforming the WikiHow linear baseline by 48.76% absolute gains in capturing sequential and non-sequential step relationships.", + "authors": [ + "Yu Zhou", + "Sha Li", + "Manling Li", + "Xudong Lin", + "Shih-Fu Chang", + "Mohit Bansal", + "Heng Ji" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.303", + "point2d": [ + 55.14460372924805, + -9.658673286437988 + ], + "cluster": 36.0 + }, + { + "idx": 305, + "title": "SCOTT: Self-Consistent Chain-of-Thought Distillation", + "abstract": "Large language models (LMs) beyond a certain scale demonstrate the emergent capability of generating free-text rationales for their predictions via chain-of-thought (CoT) prompting. While CoT can yield dramatically improved performance, such gains are only observed for sufficiently large LMs.
Even more concerning, there is little guarantee that the generated rationales are consistent with LM\u2019s predictions or faithfully justify the decisions. In this work, we propose SCOTT, a faithful knowledge distillation method to learn a small, self-consistent CoT model from a teacher model that is orders of magnitude larger. To form better supervision, we elicit rationales supporting the gold answers from a large LM (teacher) by contrastive decoding, which encourages the teacher to generate tokens that become more plausible only when the answer is considered. To ensure faithful distillation, we use the teacher-generated rationales to learn a student LM with a counterfactual reasoning objective, which prevents the student from ignoring the rationales to make inconsistent predictions. Experiments show that while yielding comparable performance, our method leads to a more faithful model than baselines. Further analysis shows that such a model respects the rationales more when making decisions; thus, we can improve its performance more by refining its rationales.", + "authors": [ + "Peifeng Wang", + "Zhengyang Wang", + "Zheng Li", + "Yifan Gao", + "Bing Yin", + "Xiang Ren" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.304", + "point2d": [ + 37.8828239440918, + -11.25064468383789 + ], + "cluster": 36.0 + }, + { + "idx": 306, + "title": "Clinical Note Owns its Hierarchy: Multi-Level Hypergraph Neural Networks for Patient-Level Representation Learning", + "abstract": "Leveraging knowledge from electronic health records (EHRs) to predict a patient\u2019s condition is essential to the effective delivery of appropriate care. Clinical notes of patient EHRs contain valuable information from healthcare professionals, but have been underused due to their difficult contents and complex hierarchies. Recently, hypergraph-based methods have been proposed for document classifications. Directly adopting existing hypergraph methods on clinical notes cannot sufficiently utilize the hierarchy information of the patient, which can degrade clinical semantic information by (1) frequent neutral words and (2) hierarchies with imbalanced distribution. Thus, we propose a taxonomy-aware multi-level hypergraph neural network (TM-HGNN), where multi-level hypergraphs assemble useful neutral words with rare keywords via note and taxonomy level hyperedges to retain the clinical semantic information. The constructed patient hypergraphs are fed into hierarchical message passing layers for learning more balanced multi-level knowledge at the note and taxonomy levels. We validate the effectiveness of TM-HGNN by conducting extensive experiments with MIMIC-III dataset on benchmark in-hospital-mortality prediction.", + "authors": [ + "Nayeon Kim", + "Yinhua Piao", + "Sun Kim" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.305", + "point2d": [ + 32.35185241699219, + -42.27449035644531 + ], + "cluster": 42.0 + }, + { + "idx": 307, + "title": "Incorporating Distributions of Discourse Structure for Long Document Abstractive Summarization", + "abstract": "For text summarization, the role of discourse structure is pivotal in discerning the core content of a text. Regrettably, prior studies on incorporating Rhetorical Structure Theory (RST) into transformer-based summarization models only consider the nuclearity annotation, thereby overlooking the variety of discourse relation types. 
This paper introduces the \u2018RSTformer\u2019, a novel summarization model that comprehensively incorporates both the types and uncertainty of rhetorical relations. Our RST-attention mechanism, rooted in document-level rhetorical structure, is an extension of the recently devised Longformer framework. Through rigorous evaluation, the model proposed herein exhibits significant superiority over state-of-the-art models, as evidenced by its notable performance on several automatic metrics and human evaluation.", + "authors": [ + "Dongqi Pu", + "Yifan Wang", + "Vera Demberg" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.306", + "point2d": [ + -3.191396474838257, + 41.99003982543945 + ], + "cluster": 7.0 + }, + { + "idx": 308, + "title": "Evaluating Open-Domain Question Answering in the Era of Large Language Models", + "abstract": "Lexical matching remains the de facto evaluation method for open-domain question answering (QA). Unfortunately, lexical matching fails completely when a plausible candidate answer does not appear in the list of gold answers, which is increasingly the case as we shift from extractive to generative models. The recent success of large language models (LLMs) for QA aggravates lexical matching failures since candidate answers become longer, thereby making matching with the gold answers even more challenging. Without accurate evaluation, the true progress in open-domain QA remains unknown. In this paper, we conduct a thorough analysis of various open-domain QA models, including LLMs, by manually evaluating their answers on a subset of NQ-open, a popular benchmark. Our assessments reveal that while the true performance of all models is significantly underestimated, the performance of the InstructGPT (zero-shot) LLM increases by nearly +60%, making it on par with existing top models, and the InstructGPT (few-shot) model actually achieves a new state-of-the-art on NQ-open. We also find that more than 50% of lexical matching failures are attributed to semantically equivalent answers. We further demonstrate that regex matching ranks QA models consistent with human judgments, although still suffering from unnecessary strictness. Finally, we demonstrate that automated evaluation models are a reasonable surrogate for lexical matching in some circumstances, but not for long-form answers generated by LLMs. The automated models struggle in detecting hallucinations in LLM answers and are thus unable to evaluate LLMs. At this time, there appears to be no substitute for human evaluation.", + "authors": [ + "Ehsan Kamalloo", + "Nouha Dziri", + "Charles Clarke", + "Davood Rafiei" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.307", + "point2d": [ + 65.07183837890625, + 11.480204582214355 + ], + "cluster": 5.0 + }, + { + "idx": 309, + "title": "No clues good clues: out of context Lexical Relation Classification", + "abstract": "The accurate prediction of lexical relations between words is a challenging task in Natural Language Processing (NLP). The most recent advances in this direction come with the use of pre-trained language models (PTLMs). A PTLM typically needs \u201cwell-formed\u201d verbalized text to interact with it, either to fine-tune it or to exploit it. 
However, there are indications that commonly used PTLMs already encode enough linguistic knowledge to allow the use of minimal (or no) textual context for some linguistically motivated tasks, thus notably reducing human effort and the need for data pre-processing, and favoring techniques that are language neutral since they do not rely on syntactic structures. In this work, we explore this idea for the tasks of lexical relation classification (LRC) and graded Lexical Entailment (LE). After fine-tuning PTLMs for LRC with different verbalizations, our evaluation results show that very simple prompts are competitive for LRC and significantly outperform the graded LE SoTA. In order to gain a better insight into this phenomenon, we perform a number of quantitative statistical analyses on the results, as well as a qualitative visual exploration based on embedding projections.", + "authors": [ + "Lucia Pitarch", + "Jordi Bernad", + "Lacramioara Dranca", + "Carlos Bobed Lisbona", + "Jorge Gracia" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.308", + "point2d": [ + 14.177690505981445, + -49.89826583862305 + ], + "cluster": 9.0 + }, + { + "idx": 310, + "title": "Won\u2019t Get Fooled Again: Answering Questions with False Premises", + "abstract": "Pre-trained language models (PLMs) have shown unprecedented potential in various fields, especially as the backbones for question-answering (QA) systems. However, they tend to be easily deceived by tricky questions such as \u201cHow many eyes does the sun have?\u201d. Such frailties of PLMs often point to the lack of knowledge within them. In this paper, we find that the PLMs already possess the knowledge required to rebut such questions, and the key is how to activate the knowledge. To systematize this observation, we investigate the PLMs\u2019 responses to one kind of tricky questions, i.e., the false premises questions (FPQs). We annotate a FalseQA dataset containing 2365 human-written FPQs, with the corresponding explanations for the false premises and the revised true premise questions. Using FalseQA, we discover that PLMs are capable of discriminating FPQs by fine-tuning on moderate numbers (e.g., 256) of examples. PLMs also generate reasonable explanations for the false premise, which serve as rebuttals. Further replaying a few general questions during training allows PLMs to excel on FPQs and general questions simultaneously. Our work suggests that once the rebuttal ability is stimulated, knowledge inside the PLMs can be effectively utilized to handle FPQs, which incentivizes the research on PLM-based QA systems. The FalseQA dataset and code are available at https://github.com/thunlp/FalseQA.", + "authors": [ + "Shengding Hu", + "Yifan Luo", + "Huadong Wang", + "Xingyi Cheng", + "Zhiyuan Liu", + "Maosong Sun" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.309", + "point2d": [ + 63.046348571777344, + 5.776121616363525 + ], + "cluster": 5.0 + }, + { + "idx": 311, + "title": "What the DAAM: Interpreting Stable Diffusion Using Cross Attention", + "abstract": "Diffusion models are a milestone in text-to-image generation, but they remain poorly understood, lacking interpretability analyses. In this paper, we perform a text-image attribution analysis on Stable Diffusion, a recently open-sourced model. To produce attribution maps, we upscale and aggregate cross-attention maps in the denoising module, naming our method DAAM.
We validate it by testing its segmentation ability on nouns, as well as its generalized attribution quality on all parts of speech, rated by humans. On two generated datasets, we attain a competitive 58.8-64.8 mIoU on noun segmentation and fair to good mean opinion scores (3.4-4.2) on generalized attribution. Then, we apply DAAM to study the role of syntax in the pixel space across head\u2013dependent heat map interaction patterns for ten common dependency relations. We show that, for some relations, the head map consistently subsumes the dependent, while the opposite is true for others. Finally, we study several semantic phenomena, focusing on feature entanglement; we find that the presence of cohyponyms worsens generation quality by 9%, and descriptive adjectives attend too broadly. We are the first to interpret large diffusion models from a visuolinguistic perspective, which enables future research. Our code is at https://github.com/castorini/daam.", + "authors": [ + "Raphael Tang", + "Linqing Liu", + "Akshat Pandey", + "Zhiying Jiang", + "Gefei Yang", + "Karun Kumar", + "Pontus Stenetorp", + "Jimmy Lin", + "Ferhan Ture" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.310", + "point2d": [ + -63.52618408203125, + 46.11532211303711 + ], + "cluster": 43.0 + }, + { + "idx": 312, + "title": "Zero-shot Faithful Factual Error Correction", + "abstract": "Faithfully correcting factual errors is critical for maintaining the integrity of textual knowledge bases and preventing hallucinations in sequence-to-sequence models. Drawing on humans\u2019 ability to identify and correct factual errors, we present a zero-shot framework that formulates questions about input claims, looks for correct answers in the given evidence, and assesses the faithfulness of each correction based on its consistency with the evidence. Our zero-shot framework outperforms fully-supervised approaches, as demonstrated by experiments on the FEVER and SciFact datasets, where our outputs are shown to be more faithful. More importantly, the decomposability nature of our framework inherently provides interpretability. Additionally, to reveal the most suitable metrics for evaluating factual error corrections, we analyze the correlation between commonly used metrics with human judgments in terms of three different dimensions regarding intelligibility and faithfulness.", + "authors": [ + "Kung-Hsiang Huang", + "Hou Pong Chan", + "Heng Ji" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.311", + "point2d": [ + 27.892547607421875, + 6.907886505126953 + ], + "cluster": 31.0 + }, + { + "idx": 313, + "title": "Open-Domain Hierarchical Event Schema Induction by Incremental Prompting and Verification", + "abstract": "Event schemas are a form of world knowledge about the typical progression of events. Recent methods for event schema induction use information extraction systems to construct a large number of event graph instances from documents, and then learn to generalize the schema from such instances. In contrast, we propose to treat event schemas as a form of commonsense knowledge that can be derived from large language models (LLMs). This new paradigm greatly simplifies the schema induction process and allows us to handle both hierarchical relations and temporal relations between events in a straightforward way. 
Since event schemas have complex graph structures, we design an incremental prompting and verification method IncPrompt to break down the construction of a complex event graph into three stages: event skeleton construction, event expansion, and event-event relation verification. Compared to directly using LLMs to generate a linearized graph, IncSchema can generate large and complex schemas with 7.2% F1 improvement in temporal relations and 31.0% F1 improvement in hierarchical relations. In addition, compared to the previous state-of-the-art closed-domain schema induction model, human assessors were able to cover ~10% more events when translating the schemas into coherent stories and rated our schemas 1.3 points higher (on a 5-point scale) in terms of readability.", + "authors": [ + "Sha Li", + "Ruining Zhao", + "Manling Li", + "Heng Ji", + "Chris Callison-Burch", + "Jiawei Han" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.312", + "point2d": [ + 56.20844268798828, + -44.242916107177734 + ], + "cluster": 28.0 + }, + { + "idx": 314, + "title": "Zero-shot Approach to Overcome Perturbation Sensitivity of Prompts", + "abstract": "Recent studies have demonstrated that natural-language prompts can help to leverage the knowledge learned by pre-trained language models for the binary sentence-level sentiment classification task. Specifically, these methods utilize few-shot learning settings to fine-tune the sentiment classification model using manual or automatically generated prompts. However, the performance of these methods is sensitive to the perturbations of the utilized prompts. Furthermore, these methods depend on a few labeled instances for automatic prompt generation and prompt ranking. This study aims to find high-quality prompts for the given task in a zero-shot setting. Given a base prompt, our proposed approach automatically generates multiple prompts similar to the base prompt employing positional, reasoning, and paraphrasing techniques and then ranks the prompts using a novel metric. We empirically demonstrate that the top-ranked prompts are high-quality and significantly outperform the base prompt and the prompts generated using few-shot learning for the binary sentence-level sentiment classification task.", + "authors": [ + "Mohna Chakraborty", + "Adithya Kulkarni", + "Qi Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.313", + "point2d": [ + -13.992992401123047, + -10.11070442199707 + ], + "cluster": 24.0 + }, + { + "idx": 315, + "title": "Free Lunch: Robust Cross-Lingual Transfer via Model Checkpoint Averaging", + "abstract": "Massively multilingual language models have displayed strong performance in zero-shot (ZS-XLT) and few-shot (FS-XLT) cross-lingual transfer setups, where models fine-tuned on task data in a source language are transferred without any or with only a few annotated instances to the target language(s). However, current work typically overestimates model performance as fine-tuned models are frequently evaluated at model checkpoints that generalize best to validation instances in the target languages. This effectively violates the main assumptions of \u2018true\u2019 ZS-XLT and FS-XLT. Such XLT setups require robust methods that do not depend on labeled target language data for validation and model selection. 
In this work, aiming to improve the robustness of \u2018true\u2019 ZS-XLT and FS-XLT, we propose a simple and effective method that averages different checkpoints (i.e., model snapshots) during task fine-tuning. We conduct exhaustive ZS-XLT and FS-XLT experiments across higher-level semantic tasks (NLI, extractive QA) and lower-level token classification tasks (NER, POS). The results indicate that averaging model checkpoints yields systematic and consistent performance gains across diverse target languages in all tasks. Importantly, it simultaneously substantially desensitizes XLT to varying hyperparameter choices in the absence of target language validation. We also show that checkpoint averaging benefits performance when further combined with run averaging (i.e., averaging the parameters of models fine-tuned over independent runs).", + "authors": [ + "Fabian David Schmidt", + "Ivan Vuli\u0107", + "Goran Glava\u0161" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.314", + "point2d": [ + -59.51996994018555, + -19.41875648498535 + ], + "cluster": 8.0 + }, + { + "idx": 316, + "title": "Cross-View Language Modeling: Towards Unified Cross-Lingual Cross-Modal Pre-training", + "abstract": "In this paper, we introduce Cross-View Language Modeling, a simple and effective pre-training framework that unifies cross-lingual and cross-modal pre-training with shared architectures and objectives. Our approach is motivated by a key observation that cross-lingual and cross-modal pre-training share the same goal of aligning two different views of the same object into a common semantic space. To this end, the cross-view language modeling framework considers both multi-modal data (i.e., image-caption pairs) and multi-lingual data (i.e., parallel sentence pairs) as two different views of the same object, and trains the model to align the two views by maximizing the mutual information between them with conditional masked language modeling and contrastive learning. We pre-train CCLM, a Cross-lingual Cross-modal Language Model, with the cross-view language modeling framework. Empirical results on IGLUE, a multi-lingual multi-modal benchmark, and two multi-lingual image-text retrieval datasets show that while conceptually simpler, CCLM significantly outperforms the prior state-of-the-art with an average absolute improvement of over 10%. Moreover, CCLM is the first multi-lingual multi-modal pre-trained model that surpasses the translate-test performance of representative English vision-language models by zero-shot cross-lingual transfer.", + "authors": [ + "Yan Zeng", + "Wangchunshu Zhou", + "Ao Luo", + "Ziming Cheng", + "Xinsong Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.315", + "point2d": [ + -57.07249450683594, + 34.65837478637695 + ], + "cluster": 26.0 + }, + { + "idx": 317, + "title": "Unsupervised Discontinuous Constituency Parsing with Mildly Context-Sensitive Grammars", + "abstract": "We study grammar induction with mildly context-sensitive grammars for unsupervised discontinuous parsing. Using the probabilistic linear context-free rewriting system (LCFRS) formalism, our approach fixes the rule structure in advance and focuses on parameter learning with maximum likelihood. 
To reduce the computational complexity of both parsing and parameter estimation, we restrict the grammar formalism to LCFRS-2 (i.e., binary LCFRS with fan-out two) and further discard rules that require O(l^6) time to parse, reducing inference to O(l^5). We find that using a large number of nonterminals is beneficial and thus make use of tensor decomposition-based rank-space dynamic programming with an embedding-based parameterization of rule probabilities to scale up the number of nonterminals. Experiments on German and Dutch show that our approach is able to induce linguistically meaningful trees with continuous and discontinuous structures.", + "authors": [ + "Songlin Yang", + "Roger Levy", + "Yoon Kim" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.316", + "point2d": [ + -23.10361671447754, + -62.331153869628906 + ], + "cluster": 41.0 + }, + { + "idx": 318, + "title": "Simplicity Bias in Transformers and their Ability to Learn Sparse Boolean Functions", + "abstract": "Despite the widespread success of Transformers on NLP tasks, recent works have found that they struggle to model several formal languages when compared to recurrent models. This raises the question of why Transformers perform well in practice and whether they have any properties that enable them to generalize better than recurrent models. In this work, we conduct an extensive empirical study on Boolean functions to demonstrate the following: (i) Random Transformers are relatively more biased towards functions of low sensitivity. (ii) When trained on Boolean functions, both Transformers and LSTMs prioritize learning functions of low sensitivity, with Transformers ultimately converging to functions of lower sensitivity. (iii) On sparse Boolean functions which have low sensitivity, we find that Transformers generalize near perfectly even in the presence of noisy labels whereas LSTMs overfit and achieve poor generalization accuracy. Overall, our results provide strong quantifiable evidence that suggests differences in the inductive biases of Transformers and recurrent models which may help explain Transformer\u2019s effective generalization performance despite relatively limited expressiveness.", + "authors": [ + "Satwik Bhattamishra", + "Arkil Patel", + "Varun Kanade", + "Phil Blunsom" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.317", + "point2d": [ + -25.784709930419922, + -47.0261116027832 + ], + "cluster": 6.0 + }, + { + "idx": 319, + "title": "Counterspeeches up my sleeve! Intent Distribution Learning and Persistent Fusion for Intent-Conditioned Counterspeech Generation", + "abstract": "Counterspeech has been demonstrated to be an efficacious approach for combating hate speech. While various conventional and controlled approaches have been studied in recent years to generate counterspeech, a counterspeech with a certain intent may not be sufficient in every scenario. Due to the complex and multifaceted nature of hate speech, utilizing multiple forms of counter-narratives with varying intents may be advantageous in different circumstances. In this paper, we explore intent-conditioned counterspeech generation. At first, we develop IntentCONAN, a diversified intent-specific counterspeech dataset with 6831 counterspeeches conditioned on five intents, i.e., informative, denouncing, question, positive, and humour.
Subsequently, we propose QUARC, a two-stage framework for intent-conditioned counterspeech generation. QUARC leverages vector-quantized representations learned for each intent category along with PerFuMe, a novel fusion module to incorporate intent-specific information into the model. Our evaluation demonstrates that QUARC outperforms several baselines by an average of ~10% across evaluation metrics. An extensive human evaluation supplements our hypothesis of better and more appropriate responses than comparative systems.", + "authors": [ + "Rishabh Gupta", + "Shaily Desai", + "Manvi Goel", + "Anil Bandhakavi", + "Tanmoy Chakraborty", + "Md. Shad Akhtar" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.318", + "point2d": [ + 12.49089527130127, + 38.61567687988281 + ], + "cluster": 34.0 + }, + { + "idx": 320, + "title": "DITTO: Data-efficient and Fair Targeted Subset Selection for ASR Accent Adaptation", + "abstract": "State-of-the-art Automatic Speech Recognition (ASR) systems are known to exhibit disparate performance on varying speech accents. To improve performance on a specific target accent, a commonly adopted solution is to finetune the ASR model using accent-specific labeled speech. However, acquiring large amounts of labeled speech for specific target accents is challenging. Choosing an informative subset of speech samples that are most representative of the target accents becomes important for effective ASR finetuning. To address this problem, we propose DITTO (Data-efficient and faIr Targeted subseT selectiOn) that uses Submodular Mutual Information (SMI) functions as acquisition functions to find the most informative set of utterances matching a target accent within a fixed budget. An important feature of DITTO is that it supports fair targeting for multiple accents, i.e., it can automatically select representative data points from multiple accents when the ASR model needs to perform well on more than one accent. We show that compared to other speech selection methods, DITTO is 3-5 times as label-efficient for its improvements on the Indic-TTS and L2 datasets.", + "authors": [ + "Suraj Kothawade", + "Anmol Mekala", + "D.Chandra Sekhara Hetha Havya", + "Mayank Kothyari", + "Rishabh Iyer", + "Ganesh Ramakrishnan", + "Preethi Jyothi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.319", + "point2d": [ + -72.34213256835938, + 15.712104797363281 + ], + "cluster": 37.0 + }, + { + "idx": 321, + "title": "Verify-and-Edit: A Knowledge-Enhanced Chain-of-Thought Framework", + "abstract": "As large language models (LLMs) have become the norm in NLP, demonstrating good performance in generation and reasoning tasks, one of their most fatal disadvantages is the lack of factual correctness. Generating unfactual texts not only leads to lower performance but also degrades the trust and validity of their applications. Chain-of-Thought (CoT) prompting improves trust and model performance on complex reasoning tasks by generating interpretable reasoning chains, but still suffers from factuality concerns in knowledge-intensive tasks. In this paper, we propose the Verify-and-Edit framework for CoT prompting, which seeks to increase prediction factuality by post-editing reasoning chains according to external knowledge.
Building on top of GPT-3, our framework leads to accuracy improvements in multiple open-domain question-answering tasks.", + "authors": [ + "Ruochen Zhao", + "Xingxuan Li", + "Shafiq Joty", + "Chengwei Qin", + "Lidong Bing" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.320", + "point2d": [ + 45.61136245727539, + -10.383916854858398 + ], + "cluster": 36.0 + }, + { + "idx": 322, + "title": "Bridging the Domain Gaps in Context Representations for k-Nearest Neighbor Neural Machine Translation", + "abstract": "k-Nearest neighbor machine translation (kNN-MT) has attracted increasing attention due to its ability to non-parametrically adapt to new translation domains. By using an upstream NMT model to traverse the downstream training corpus, it is equipped with a datastore containing vectorized key-value pairs, which are retrieved during inference to benefit translation. However, there often exists a significant gap between upstream and downstream domains, which hurts the datastore retrieval and the final translation quality. To deal with this issue, we propose a novel approach to boost the datastore retrieval of kNN-MT by reconstructing the original datastore. Concretely, we design a reviser to revise the key representations, making them better fit for the downstream domain. The reviser is trained using the collected semantically-related key-queries pairs, and optimized by two proposed losses: one is the key-queries semantic distance ensuring each revised key representation is semantically related to its corresponding queries, and the other is an L2-norm loss encouraging revised key representations to effectively retain the knowledge learned by the upstream NMT model. Extensive experiments on domain adaptation tasks demonstrate that our method can effectively boost the datastore retrieval and translation quality of kNN-MT. Our code is available at https://github.com/DeepLearnXMU/Revised-knn-mt.", + "authors": [ + "Zhiwei Cao", + "Baosong Yang", + "Huan Lin", + "Suhang Wu", + "Xiangpeng Wei", + "Dayiheng Liu", + "Jun Xie", + "Min Zhang", + "Jinsong Su" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.321", + "point2d": [ + -65.84268188476562, + -14.628567695617676 + ], + "cluster": 21.0 + }, + { + "idx": 323, + "title": "Node Placement in Argument Maps: Modeling Unidirectional Relations in High & Low-Resource Scenarios", + "abstract": "Argument maps structure discourse into nodes in a tree with each node being an argument that supports or opposes its parent argument. This format is more comprehensible and less redundant compared to an unstructured one. Exploring those maps and maintaining their structure by placing new arguments under suitable parents is more challenging for users with huge maps that are typical in online discussions. To support those users, we introduce the task of node placement: suggesting candidate nodes as parents for a new contribution. We establish an upper-bound of human performance, and conduct experiments with models of various sizes and training strategies. We experiment with a selection of maps from Kialo, drawn from a heterogeneous set of domains. Based on an annotation study, we highlight the ambiguity of the task that makes it challenging for both humans and models. We examine the unidirectional relation between tree nodes and show that encoding a node into different embeddings for each of the parent and child cases improves performance.
We further show the few-shot effectiveness of our approach.", + "authors": [ + "Iman Jundi", + "Neele Falk", + "Eva Maria Vecchi", + "Gabriella Lapesa" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.322", + "point2d": [ + 16.429061889648438, + -48.26296615600586 + ], + "cluster": 19.0 + }, + { + "idx": 324, + "title": "Towards a Common Understanding of Contributing Factors for Cross-Lingual Transfer in Multilingual Language Models: A Review", + "abstract": "In recent years, pre-trained Multilingual Language Models (MLLMs) have shown a strong ability to transfer knowledge across different languages. However, given that the aspiration for such an ability has not been explicitly incorporated in the design of the majority of MLLMs, it is challenging to obtain a unique and straightforward explanation for its emergence. In this review paper, we survey literature that investigates different factors contributing to the capacity of MLLMs to perform zero-shot cross-lingual transfer and subsequently outline and discuss these factors in detail. To enhance the structure of this review and to facilitate consolidation with future studies, we identify five categories of such factors. In addition to providing a summary of empirical evidence from past studies, we identify consensuses among studies with consistent findings and resolve conflicts among contradictory ones. Our work contextualizes and unifies existing research streams which aim at explaining the cross-lingual potential of MLLMs. This review provides, first, an aligned reference point for future research and, second, guidance for a better-informed and more efficient way of leveraging the cross-lingual capacity of MLLMs.", + "authors": [ + "Fred Philippy", + "Siwen Guo", + "Shohreh Haddadan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.323", + "point2d": [ + -58.130943298339844, + -17.629182815551758 + ], + "cluster": 21.0 + }, + { + "idx": 325, + "title": "Toward Human-Like Evaluation for Natural Language Generation with Error Analysis", + "abstract": "The pretrained language model (PLM) based metrics have been successfully used in evaluating language generation tasks. Recent studies of the human evaluation community show that considering both major errors (e.g. mistranslated tokens) and minor errors (e.g. imperfections in fluency) can produce high-quality judgments. This inspires us to approach the final goal of the automatic metrics (human-like evaluations) by fine-grained error analysis. In this paper, we argue that the ability to estimate sentence confidence is the tip of the iceberg for PLM-based metrics. And it can be used to refine the generated sentence toward higher confidence and more reference-grounded, where the costs of refining and approaching reference are used to determine the major and minor errors, respectively.To this end, we take BARTScore as the testbed and present an innovative solution to marry the unexploited sentence refining capacity of BARTScore and human-like error analysis, where the final score consists of both the evaluations of major and minor errors. Experiments show that our solution consistently and significantly improves BARTScore, and outperforms top-scoring metrics in 19/25 test settings. Analyses demonstrate our method robustly and efficiently approaches human-like evaluations, enjoying better interpretability. 
Our code and scripts will be publicly released at https://github.com/Coldmist-Lu/ErrorAnalysis_NLGEvaluation.", + "authors": [ + "Qingyu Lu", + "Liang Ding", + "Liping Xie", + "Kanjian Zhang", + "Derek F. Wong", + "Dacheng Tao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.324", + "point2d": [ + -20.733718872070312, + 15.642860412597656 + ], + "cluster": 4.0 + }, + { + "idx": 326, + "title": "Connective Prediction for Implicit Discourse Relation Recognition via Knowledge Distillation", + "abstract": "Implicit discourse relation recognition (IDRR) remains a challenging task in discourse analysis due to the absence of connectives. Most existing methods utilize one-hot labels as the sole optimization target, ignoring the internal association among connectives. Besides, these approaches spend lots of effort on template construction, negatively affecting the generalization capability. To address these problems, we propose a novel Connective Prediction via Knowledge Distillation (CP-KD) approach to instruct large-scale pre-trained language models (PLMs) to mine the latent correlations between connectives and discourse relations, which is meaningful for IDRR. Experimental results on the PDTB 2.0/3.0 and CoNLL2016 datasets show that our method significantly outperforms the state-of-the-art models on coarse-grained and fine-grained discourse relations. Moreover, our approach can be transferred to explicit discourse relation recognition (EDRR) and achieve acceptable performance.", + "authors": [ + "Hongyi Wu", + "Hao Zhou", + "Man Lan", + "Yuanbin Wu", + "Yadong Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.325", + "point2d": [ + 15.76049518585205, + -50.804969787597656 + ], + "cluster": 25.0 + }, + { + "idx": 327, + "title": "What is the best recipe for character-level encoder-only modelling?", + "abstract": "This paper aims to benchmark recent progress in language understanding models that output contextualised representations at the character level. Many such modelling architectures and methods to train those architectures have been proposed, but it is currently unclear what the relative contributions of the architecture vs. the pretraining objective are to final model performance. We explore the design space of such models, comparing architectural innovations (Clark et al., 2022, Jaegle et al., 2022, Tay et al., 2021) and a variety of different pretraining objectives on a suite of evaluation tasks with a fixed training procedure in order to find the currently optimal way to build and train character-level BERT-like models. We find that our best performing character-level model exceeds the performance of a token-based model trained with the same settings on the same data, suggesting that character-level models are ready for more widespread adoption. Unfortunately, the best method to train character-level models still relies on a subword-level tokeniser during pretraining, and final model performance is highly dependent on tokeniser quality.
We believe our results demonstrate the readiness of character-level models for multilingual language representation, and encourage NLP practitioners to try them as drop-in replacements for token-based models.", + "authors": [ + "Kris Cao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.326", + "point2d": [ + -32.71530532836914, + -33.309017181396484 + ], + "cluster": 6.0 + }, + { + "idx": 328, + "title": "Unifying Cross-Lingual and Cross-Modal Modeling Towards Weakly Supervised Multilingual Vision-Language Pre-training", + "abstract": "Multilingual Vision-Language Pre-training (VLP) is a promising but challenging topic due to the lack of large-scale multilingual image-text pairs. Existing works address the problem by translating English data into other languages, which is intuitive and the generated data is usually limited in form and scale. In this paper, we explore a more practical and scalable setting: weakly supervised multilingual VLP with only English image-text pairs and multilingual text corpora. We argue that the universal multilingual representation learned from texts allows the cross-modal interaction learned in English to be transferable to other languages. To this end, we propose a framework to effectively unify cross-lingual and cross-modal pre-training. For unified modeling on different data, we design an architecture with flexible modules to learn different interactions. Moreover, two unified tasks are introduced to efficiently guide the unified cross-lingual cross-modal learning. Extensive experiments demonstrate that our pre-trained model learns universal multilingual multimodal representations, allowing effective cross-lingual transfer on multimodal tasks. Code and models are available at https://github.com/FudanDISC/weakly-supervised-mVLP.", + "authors": [ + "Zejun Li", + "Zhihao Fan", + "Jingjing Chen", + "Qi Zhang", + "Xuanjing Huang", + "Zhongyu Wei" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.327", + "point2d": [ + -56.959815979003906, + 34.743316650390625 + ], + "cluster": 26.0 + }, + { + "idx": 329, + "title": "Learning \u201cO\u201d Helps for Learning More: Handling the Unlabeled Entity Problem for Class-incremental NER", + "abstract": "As the categories of named entities rapidly increase, the deployed NER models are required to keep updating toward recognizing more entity types, creating a demand for class-incremental learning for NER. Considering the privacy concerns and storage constraints, the standard paradigm for class-incremental NER updates the models with training data only annotated with the new classes, yet the entities from other entity classes are regarded as \u201cNon-entity\u201d (or \u201cO\u201d). In this work, we conduct an empirical study on the \u201cUnlabeled Entity Problem\u201d and find that it leads to severe confusion between \u201cO\u201d and entities, decreasing class discrimination of old classes and declining the model\u2019s ability to learn new classes. To solve the Unlabeled Entity Problem, we propose a novel representation learning method to learn discriminative representations for the entity classes and \u201cO\u201d. Specifically, we propose an entity-aware contrastive learning method that adaptively detects entity clusters in \u201cO\u201d. Furthermore, we propose two effective distance-based relabeling strategies for better learning the old classes. 
We introduce a more realistic and challenging benchmark for class-incremental NER, and the proposed method achieves up to 10.62% improvement over the baseline methods.", + "authors": [ + "Ruotian Ma", + "Xuanting Chen", + "Zhang Lin", + "Xin Zhou", + "Junzhe Wang", + "Tao Gui", + "Qi Zhang", + "Xiang Gao", + "Yun Wen Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.328", + "point2d": [ + 32.135780334472656, + -85.53325653076172 + ], + "cluster": 14.0 + }, + { + "idx": 330, + "title": "Scene Graph as Pivoting: Inference-time Image-free Unsupervised Multimodal Machine Translation with Visual Scene Hallucination", + "abstract": "In this work, we investigate a more realistic unsupervised multimodal machine translation (UMMT) setup, inference-time image-free UMMT, where the model is trained with source-text image pairs, and tested with only source-text inputs. First, we represent the input images and texts with the visual and language scene graphs (SG), where such fine-grained vision-language features ensure a holistic understanding of the semantics. To enable pure-text input during inference, we devise a visual scene hallucination mechanism that dynamically generates pseudo visual SG from the given textual SG. Several SG-pivoting based learning objectives are introduced for unsupervised translation training. On the benchmark Multi30K data, our SG-based method outperforms the best-performing baseline by significant BLEU scores on the task and setup, helping yield translations with better completeness, relevance and fluency without relying on paired images. Further in-depth analyses reveal how our model advances in the task setting.", + "authors": [ + "Hao Fei", + "Qian Liu", + "Meishan Zhang", + "Min Zhang", + "Tat-Seng Chua" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.329", + "point2d": [ + -66.65745544433594, + 39.96936798095703 + ], + "cluster": 26.0 + }, + { + "idx": 331, + "title": "CoLaDa: A Collaborative Label Denoising Framework for Cross-lingual Named Entity Recognition", + "abstract": "Cross-lingual named entity recognition (NER) aims to train an NER system that generalizes well to a target language by leveraging labeled data in a given source language. Previous work alleviates the data scarcity problem by translating source-language labeled data or performing knowledge distillation on target-language unlabeled data. However, these methods may suffer from label noise due to the automatic labeling process. In this paper, we propose CoLaDa, a Collaborative Label Denoising Framework, to address this problem. Specifically, we first explore a model-collaboration-based denoising scheme that enables models trained on different data sources to collaboratively denoise pseudo labels used by each other. We then present an instance-collaboration-based strategy that considers the label consistency of each token\u2019s neighborhood in the representation space for denoising. 
Experiments on different benchmark datasets show that the proposed CoLaDa achieves superior results compared to previous methods, especially when generalizing to distant languages.", + "authors": [ + "Tingting Ma", + "Qianhui Wu", + "Huiqiang Jiang", + "B\u00f6rje Karlsson", + "Tiejun Zhao", + "Chin-Yew Lin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.330", + "point2d": [ + 30.510034561157227, + -87.69467163085938 + ], + "cluster": 14.0 + }, + { + "idx": 332, + "title": "Dialect-robust Evaluation of Generated Text", + "abstract": "Text generation metrics that are not robust to dialect variation make it impossible to tell how well systems perform for many groups of users, and can even penalize systems for producing text in lower-resource dialects. In this paper, we introduce a suite of methods to assess whether metrics are dialect robust. These methods show that state-of-the-art metrics are not dialect robust: they often prioritize dialect similarity over semantics, preferring outputs that are semantically incorrect over outputs that match the semantics of the reference but contain dialect differences. As a step towards dialect-robust metrics for text generation, we propose NANO, which introduces regional and language information to the metric\u2019s pretraining. NANO significantly improves dialect robustness while preserving the correlation between automated metrics and human ratings. It also enables a more ambitious approach to evaluation, dialect awareness, in which system outputs are scored by both semantic match to the reference and appropriateness in any specified dialect.", + "authors": [ + "Jiao Sun", + "Thibault Sellam", + "Elizabeth Clark", + "Tu Vu", + "Timothy Dozat", + "Dan Garrette", + "Aditya Siddhant", + "Jacob Eisenstein", + "Sebastian Gehrmann" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.331", + "point2d": [ + -18.122652053833008, + 17.439306259155273 + ], + "cluster": 4.0 + }, + { + "idx": 333, + "title": "Understanding and Improving the Robustness of Terminology Constraints in Neural Machine Translation", + "abstract": "In this work, we study the robustness of two typical terminology translation methods: Placeholder (PH) and Code-Switch (CS), concerning (1) the number of constraints and (2) the target constraint length. We identify that existing terminology constraint test sets, such as IATE, Wiktionary, and TICO, are blind to this issue due to oversimplified constraint settings. To solve it, we create a new challenging test set of English-German, increasing the average constraint count per sentence from 1.1~1.7 to 6.1 and the length per target constraint from 1.1~1.2 words to 3.4 words. Then we find that PH and CS methods degrade as the number of constraints increases, but they have complementary strengths. Specifically, PH is better at retaining high constraint accuracy but lower translation quality as measured by BLEU and COMET scores. In contrast, CS has the opposite results. Based on these observations, we propose a simple but effective method combining the advantages of PH and CS. This approach involves training a model like PH to predict the term labels, and then during inference replacing those labels with target terminology text like CS, so that the subsequent generation is aware of the target term content. 
Extensive experimental results show that this approach can achieve high constraint accuracy and translation quality simultaneously, regardless of the number or length of constraints.", + "authors": [ + "Huaao Zhang", + "Qiang Wang", + "Bo Qin", + "Zelin Shi", + "Haibo Wang", + "Ming Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.332", + "point2d": [ + -63.81901550292969, + -7.192802429199219 + ], + "cluster": 1.0 + }, + { + "idx": 334, + "title": "Language model acceptability judgements are not always robust to context", + "abstract": "Targeted syntactic evaluations of language models ask whether models show stable preferences for syntactically acceptable content over minimal-pair unacceptable inputs. Our best syntactic evaluation datasets, however, provide substantially less linguistic context than models receive during pretraining. This mismatch raises an important question: how robust are models\u2019 syntactic judgements across different contexts? In this paper, we vary the input contexts based on: length, the types of syntactic phenomena it contains, and whether or not there are grammatical violations. We find that model judgements are generally robust when placed in randomly sampled linguistic contexts, but are unstable when contexts match the test stimuli in syntactic structure. Among all tested models (GPT-2 and five variants of OPT), we find that model performance is affected when we provided contexts with matching syntactic structure: performance significantly improves when contexts are acceptable, and it significantly declines when they are unacceptable. This effect is amplified by the length of the context, except for unrelated inputs. We show that these changes in model performance are not explainable by acceptability-preserving syntactic perturbations. This sensitivity to highly specific syntactic features of the context can only be explained by the models\u2019 implicit in-context learning abilities.", + "authors": [ + "Koustuv Sinha", + "Jon Gauthier", + "Aaron Mueller", + "Kanishka Misra", + "Keren Fuentes", + "Roger Levy", + "Adina Williams" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.333", + "point2d": [ + -28.74758529663086, + -41.54967498779297 + ], + "cluster": 41.0 + }, + { + "idx": 335, + "title": "RobuT: A Systematic Study of Table QA Robustness Against Human-Annotated Adversarial Perturbations", + "abstract": "Despite significant progress having been made in question answering on tabular data (Table QA), it\u2019s unclear whether, and to what extent existing Table QA models are robust to task-specific perturbations, e.g., replacing key question entities or shuffling table columns. To systematically study the robustness of Table QA models, we propose a benchmark called RobuT, which builds upon existing Table QA datasets (WTQ, WikiSQL-Weak, and SQA) and includes human-annotated adversarial perturbations in terms of table header, table content, and question. Our results indicate that both state-of-the-art Table QA models and large language models (e.g., GPT-3) with few-shot learning falter in these adversarial sets. 
We propose to address this problem by using large language models to generate adversarial examples to enhance training, which significantly improves the robustness of Table QA models.", + "authors": [ + "Yilun Zhao", + "Chen Zhao", + "Linyong Nan", + "Zhenting Qi", + "Wenlin Zhang", + "Xiangru Tang", + "Boyu Mi", + "Dragomir Radev" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.334", + "point2d": [ + 76.8929443359375, + 9.771888732910156 + ], + "cluster": 5.0 + }, + { + "idx": 336, + "title": "Morphological Inflection: A Reality Check", + "abstract": "Morphological inflection is a popular task in sub-word NLP with both practical and cognitive applications. For years now, state-of-the-art systems have reported high, but also highly variable, performance across data sets and languages. We investigate the causes of this high performance and high variability; we find several aspects of data set creation and evaluation which systematically inflate performance and obfuscate differences between languages. To improve generalizability and reliability of results, we propose new data sampling and evaluation strategies that better reflect likely use-cases. Using these new strategies, we make new observations on the generalization abilities of current inflection systems.", + "authors": [ + "Jordan Kodner", + "Sarah Payne", + "Salam Khalifa", + "Zoey Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.335", + "point2d": [ + -35.91106033325195, + -42.75251007080078 + ], + "cluster": 46.0 + }, + { + "idx": 337, + "title": "TOME: A Two-stage Approach for Model-based Retrieval", + "abstract": "Recently, model-based retrieval has emerged as a new paradigm in text retrieval that discards the index in the traditional retrieval model and instead memorizes the candidate corpora using model parameters. This design employs a sequence-to-sequence paradigm to generate document identifiers, which enables the complete capture of the relevance between queries and documents and simplifies the classic index-retrieval-rerank pipeline. Despite its attractive qualities, there remain several major challenges in model-based retrieval, including the discrepancy between pre-training and fine-tuning, and the discrepancy between training and inference. To deal with the above challenges, we propose a novel two-stage model-based retrieval approach called TOME, which makes two major technical contributions, including the utilization of tokenized URLs as identifiers and the design of a two-stage generation architecture. We also propose a number of training strategies to deal with the training difficulty as the corpus size increases. 
Extensive experiments and analysis on MS MARCO and Natural Questions demonstrate the effectiveness of our proposed approach, and we investigate the scaling laws of TOME by examining various influencing factors.", + "authors": [ + "Ruiyang Ren", + "Wayne Xin Zhao", + "Jing Liu", + "Hua Wu", + "Ji-Rong Wen", + "Haifeng Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.336", + "point2d": [ + 15.770859718322754, + -16.600799560546875 + ], + "cluster": 18.0 + }, + { + "idx": 338, + "title": "Using Neural Machine Translation for Generating Diverse Challenging Exercises for Language Learner", + "abstract": "We propose a novel approach to automatically generate distractors for cloze exercises for English language learners, using round-trip neural machine translation. A carrier sentence is translated from English into another (pivot) language and back, and distractors are produced by aligning the original sentence with its round-trip translation. We make use of 16 linguistically-diverse pivots and generate hundreds of translation hypotheses in each direction. We show that using hundreds of translations allows us to generate a rich set of challenging distractors. Moreover, we find that typologically unrelated language pivots contribute more diverse candidate distractors, compared to language pivots that are closely related. We further evaluate the use of machine translation systems of varying quality and find that better quality MT systems produce more challenging distractors. Finally, we conduct a study with language learners, demonstrating that the automatically generated distractors are of the same difficulty as the gold distractors produced by human experts.", + "authors": [ + "Frank Palma Gomez", + "Subhadarshi Panda", + "Michael Flor", + "Alla Rozovskaya" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.337", + "point2d": [ + -66.53800964355469, + -4.6952667236328125 + ], + "cluster": 1.0 + }, + { + "idx": 339, + "title": "Similarity-weighted Construction of Contextualized Commonsense Knowledge Graphs for Knowledge-intense Argumentation Tasks", + "abstract": "Arguments often do not make explicit how a conclusion follows from its premises. To compensate for this lack, we enrich arguments with structured background knowledge to support knowledge-intense argumentation tasks. We present a new unsupervised method for constructing Contextualized Commonsense Knowledge Graphs (CCKGs) that selects contextually relevant knowledge from large knowledge graphs (KGs) efficiently and at high quality. Our work goes beyond context-insensitive knowledge extraction heuristics by computing semantic similarity between KG triplets and textual arguments. Using these triplet similarities as weights, we extract contextualized knowledge paths that connect a conclusion to its premise, while maximizing similarity to the argument. We combine multiple paths into a CCKG that we optionally prune to reduce noise and raise precision. Intrinsic evaluation of the quality of our graphs shows that our method is effective for (re)constructing human explanation graphs. Manual evaluations in a large-scale knowledge selection setup verify high recall and precision of implicit CSK in the CCKGs. 
Finally, we demonstrate the effectiveness of CCKGs in a knowledge-insensitive argument quality rating task, outperforming strong baselines and rivaling a GPT-3 based system.", + "authors": [ + "Moritz Plenz", + "Juri Opitz", + "Philipp Heinisch", + "Philipp Cimiano", + "Anette Frank" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.338", + "point2d": [ + 64.21678161621094, + -11.782615661621094 + ], + "cluster": 31.0 + }, + { + "idx": 340, + "title": "miCSE: Mutual Information Contrastive Learning for Low-shot Sentence Embeddings", + "abstract": "This paper presents miCSE, a mutual information-based contrastive learning framework that significantly advances the state-of-the-art in few-shot sentence embedding. The proposed approach imposes alignment between the attention pattern of different views during contrastive learning. Learning sentence embeddings with miCSE entails enforcing the structural consistency across augmented views for every sentence, making contrastive self-supervised learning more sample efficient. As a result, the proposed approach shows strong performance in the few-shot learning domain. While it achieves superior results compared to state-of-the-art methods on multiple benchmarks in few-shot learning, it is comparable in the full-shot scenario. This study opens up avenues for efficient self-supervised learning methods that are more robust than current contrastive methods for sentence embedding.", + "authors": [ + "Tassilo Klein", + "Moin Nabi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.339", + "point2d": [ + 4.439911365509033, + -34.635009765625 + ], + "cluster": 32.0 + }, + { + "idx": 341, + "title": "Learning Non-linguistic Skills without Sacrificing Linguistic Proficiency", + "abstract": "The field of Math-NLP has witnessed significant growth in recent years, motivated by the desire to expand LLM performance to the learning of non-linguistic notions (numerals, and subsequently, arithmetic reasoning). However, non-linguistic skill injection typically comes at a cost for LLMs: it leads to catastrophic forgetting of core linguistic skills, a consequence that often remains unaddressed in the literature. As Math-NLP has been able to create LLMs that can closely approximate the mathematical skills of a grade schooler or the arithmetic reasoning skills of a calculator, the practicality of these models fails if they concomitantly shed their linguistic capabilities. In this work, we take a closer look into the phenomenon of catastrophic forgetting as it pertains to LLMs and subsequently offer a novel framework for non-linguistic skill injection for LLMs based on information-theoretic interventions and skill-specific losses that enable the learning of strict arithmetic reasoning.
Our model outperforms the state-of-the-art both on injected non-linguistic skills and on linguistic knowledge retention, and does so with a fraction of the non-linguistic training data (1/4) and zero additional synthetic linguistic training data.", + "authors": [ + "Mandar Sharma", + "Nikhil Muralidhar", + "Naren Ramakrishnan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.340", + "point2d": [ + 43.23484802246094, + -18.335752487182617 + ], + "cluster": 12.0 + }, + { + "idx": 342, + "title": "Forgotten Knowledge: Examining the Citational Amnesia in NLP", + "abstract": "Citing papers is the primary method through which modern scientific writing discusses and builds on past work. Collectively, citing a diverse set of papers (in time and area of study) is an indicator of how widely the community is reading. Yet, there is little work looking at broad temporal patterns of citation. This work systematically and empirically examines: How far back in time do we tend to go to cite papers? How has that changed over time, and what factors correlate with this citational attention/amnesia? We chose NLP as our domain of interest and analyzed approximately 71.5K papers to show and quantify several key trends in citation. Notably, around 62% of cited papers are from the immediate five years prior to publication, whereas only about 17% are more than ten years old. Furthermore, we show that the median age and age diversity of cited papers were steadily increasing from 1990 to 2014, but since then, the trend has reversed, and current NLP papers have an all-time low temporal citation diversity. Finally, we show that unlike the 1990s, the highly cited papers in the last decade were also papers with the least citation diversity, likely contributing to the intense (and arguably harmful) recency focus. Code, data, and a demo are available on the project homepage.", + "authors": [ + "Janvijay Singh", + "Mukund Rungta", + "Diyi Yang", + "Saif Mohammad" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.341", + "point2d": [ + 17.181488037109375, + 17.49143409729004 + ], + "cluster": 40.0 + }, + { + "idx": 343, + "title": "Measuring the Instability of Fine-Tuning", + "abstract": "Fine-tuning pre-trained language models on downstream tasks with varying random seeds has been shown to be unstable, especially on small datasets. Many previous studies have investigated this instability and proposed methods to mitigate it. However, most of these studies only used the standard deviation of performance scores (SD) as their measure, which is a narrow characterization of instability. In this paper, we analyze SD and six other measures quantifying instability of different granularity levels. Moreover, we propose a systematic evaluation framework of these measures\u2019 validity. Finally, we analyze the consistency and difference between different measures by reassessing existing instability mitigation methods. 
We hope our results will inform better measurements of the fine-tuning instability.", + "authors": [ + "Yupei Du", + "Dong Nguyen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.342", + "point2d": [ + -32.763038635253906, + -15.864925384521484 + ], + "cluster": 8.0 + }, + { + "idx": 344, + "title": "FairPrism: Evaluating Fairness-Related Harms in Text Generation", + "abstract": "It is critical to measure and mitigate fairness-related harms caused by AI text generation systems, including stereotyping and demeaning harms. To that end, we introduce FairPrism, a dataset of 5,000 examples of AI-generated English text with detailed human annotations covering a diverse set of harms relating to gender and sexuality. FairPrism aims to address several limitations of existing datasets for measuring and mitigating fairness-related harms, including improved transparency, clearer specification of dataset coverage, and accounting for annotator disagreement and harms that are context-dependent. FairPrism\u2019s annotations include the extent of stereotyping and demeaning harms, the demographic groups targeted, and appropriateness for different applications. The annotations also include specific harms that occur in interactive contexts and harms that raise normative concerns when the \u201cspeaker\u201d is an AI system. Due to its precision and granularity, FairPrism can be used to diagnose (1) the types of fairness-related harms that AI text generation systems cause, and (2) the potential limitations of mitigation methods, both of which we illustrate through case studies. Finally, the process we followed to develop FairPrism offers a recipe for building improved datasets for measuring and mitigating harms caused by AI systems.", + "authors": [ + "Eve Fleisig", + "Aubrie Amstutz", + "Chad Atalla", + "Su Lin Blodgett", + "Hal Daum\u00e9 III", + "Alexandra Olteanu", + "Emily Sheng", + "Dan Vann", + "Hanna Wallach" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.343", + "point2d": [ + 26.16721534729004, + 38.909217834472656 + ], + "cluster": 10.0 + }, + { + "idx": 345, + "title": "Factually Consistent Summarization via Reinforcement Learning with Textual Entailment Feedback", + "abstract": "Despite the seeming success of contemporary grounded text generation systems, they often tend to generate factually inconsistent text with respect to their input. This phenomenon is emphasized in tasks like summarization, in which the generated summaries should be corroborated by their source article.
In this work we leverage recent progress on textual entailment models to directly address this problem for abstractive summarization systems. We use reinforcement learning with reference-free, textual-entailment rewards to optimize for factual consistency and explore the ensuing trade-offs, as improved consistency may come at the cost of less informative or more extractive summaries. Our results, according to both automatic metrics and human evaluation, show that our method considerably improves the faithfulness, salience and conciseness of the generated summaries.", + "authors": [ + "Paul Roit", + "Johan Ferret", + "Lior Shani", + "Roee Aharoni", + "Geoffrey Cideron", + "Robert Dadashi", + "Matthieu Geist", + "Sertan Girgin", + "Leonard Hussenot", + "Orgad Keller", + "Nikola Momchev", + "Sabela Ramos Garea", + "Piotr Stanczyk", + "Nino Vieillard", + "Olivier Bachem", + "Gal Elidan", + "Avinatan Hassidim", + "Olivier Pietquin", + "Idan Szpektor" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.344", + "point2d": [ + -4.3730292320251465, + 45.43555450439453 + ], + "cluster": 47.0 + }, + { + "idx": 346, + "title": "SIMMC-VR: A Task-oriented Multimodal Dialog Dataset with Situated and Immersive VR Streams", + "abstract": "Building an AI assistant that can seamlessly converse and instruct humans, in a user-centric situated scenario, requires several essential abilities: (1) spatial and temporal understanding of the situated and real-time user scenes, (2) capability of grounding the actively perceived visuals of users to conversation contexts, and (3) conversational reasoning over past utterances to perform just-in-time assistance. However, we currently lack a large-scale benchmark that captures user\u2013assistant interactions with all of the aforementioned features. To this end, we propose SIMMC-VR, an extension of the SIMMC-2.0 dataset, to a video-grounded task-oriented dialog dataset that captures real-world AI-assisted user scenarios in VR. We propose a novel data collection paradigm that involves (1) generating object-centric multimodal dialog flows with egocentric visual streams and visually-grounded templates, and (2) manually paraphrasing the simulated dialogs for naturalness and diversity while preserving multimodal dependencies. To measure meaningful progress in the field, we propose four tasks to address the new challenges in SIMMC-VR, which require complex spatial-temporal dialog reasoning in active egocentric scenes. We benchmark the proposed tasks with strong multimodal models, and highlight the key capabilities that current models lack for future research directions.", + "authors": [ + "Te-Lin Wu", + "Satwik Kottur", + "Andrea Madotto", + "Mahmoud Azab", + "Pedro Rodriguez", + "Babak Damavandi", + "Nanyun Peng", + "Seungwhan Moon" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.345", + "point2d": [ + 8.534823417663574, + 77.1856918334961 + ], + "cluster": 24.0 + }, + { + "idx": 347, + "title": "Multilingual LLMs are Better Cross-lingual In-context Learners with Alignment", + "abstract": "In-context learning (ICL) unfolds as large language models become capable of inferring test labels conditioned on a few labeled samples without any gradient update. ICL-enabled large language models provide a promising step forward toward bypassing recurrent annotation costs in a low-resource setting.
Yet, only a handful of past studies have explored ICL in a cross-lingual setting, in which the need for transferring label-knowledge from a high-resource language to a low-resource one is immensely crucial. To bridge the gap, we provide the first in-depth analysis of ICL for cross-lingual text classification. We find that the prevalent mode of selecting random input-label pairs to construct the prompt-context is severely limited in the case of cross-lingual ICL, primarily due to the lack of alignment in the input as well as the output spaces. To mitigate this, we propose a novel prompt construction strategy \u2014 Cross-lingual In-context Source Target Alignment (X-InSTA). With an injected coherence in the semantics of the input examples and a task-based alignment across the source and target languages, X-InSTA is able to outperform random prompt selection by a large margin across three different tasks using 44 different cross-lingual pairs.", + "authors": [ + "Eshaan Tanwar", + "Subhabrata Dutta", + "Manish Borthakur", + "Tanmoy Chakraborty" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.346", + "point2d": [ + -13.928801536560059, + -24.748632431030273 + ], + "cluster": 3.0 + }, + { + "idx": 348, + "title": "APOLLO: A Simple Approach for Adaptive Pretraining of Language Models for Logical Reasoning", + "abstract": "Logical reasoning over text is an important ability that requires understanding the semantics of the text and reasoning through them to arrive at correct inferences. Prior works on pretraining language models to improve the logical reasoning ability require complex processing of training data (e.g., aligning symbolic knowledge to text), yielding task-specific data augmentation that is not easy to adapt to any general text corpus. In this work, we propose APOLLO, a simple adaptive pretraining approach to improve the logical reasoning skills of language models. We select a subset of Wikipedia for adaptive pretraining using a set of logical inference keywords as filter words. Further, we propose two self-supervised loss functions for training. First, we modify the masked language modeling loss only to mask specific parts-of-speech words that likely require higher-order reasoning to predict them. Second, we propose a sentence-level classification loss that teaches the model to distinguish between entailment and contradiction types of sentences. The proposed pretraining paradigm is both simple and independent of task formats. We demonstrate the effectiveness of APOLLO by comparing it with prior baselines on two logical reasoning datasets. APOLLO performs comparably on ReClor and outperforms baselines on LogiQA.", + "authors": [ + "Soumya Sanyal", + "Yichong Xu", + "Shuohang Wang", + "Ziyi Yang", + "Reid Pryzant", + "Wenhao Yu", + "Chenguang Zhu", + "Xiang Ren" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.347", + "point2d": [ + 42.917945861816406, + -13.665155410766602 + ], + "cluster": 36.0 + }, + { + "idx": 349, + "title": "MultiTabQA: Generating Tabular Answers for Multi-Table Question Answering", + "abstract": "Recent advances in tabular question answering (QA) with large language models are constrained in their coverage and only answer questions over a single table. However, real-world queries are complex in nature, often over multiple tables in a relational database or web page. 
Single table questions do not involve common table operations such as set operations, Cartesian products (joins), or nested queries. Furthermore, multi-table operations often result in a tabular output, which necessitates table generation capabilities of tabular QA models. To fill this gap, we propose a new task of answering questions over multiple tables. Our model, MultiTabQA, not only answers questions over multiple tables, but also generalizes to generate tabular answers. To enable effective training, we build a pre-training dataset comprising of 132,645 SQL queries and tabular answers. Further, we evaluate the generated tables by introducing table-specific metrics of varying strictness assessing various levels of granularity of the table structure. MultiTabQA outperforms state-of-the-art single table QA models adapted to a multi-table QA setting by finetuning on three datasets: Spider, Atis and GeoQuery.", + "authors": [ + "Vaishali Pal", + "Andrew Yates", + "Evangelos Kanoulas", + "Maarten de Rijke" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.348", + "point2d": [ + 77.65693664550781, + 5.395281791687012 + ], + "cluster": 5.0 + }, + { + "idx": 350, + "title": "To Copy Rather Than Memorize: A Vertical Learning Paradigm for Knowledge Graph Completion", + "abstract": "Embedding models have shown great power in knowledge graph completion (KGC) task. By learning structural constraints for each training triple, these methods implicitly memorize intrinsic relation rules to infer missing links. However, this paper points out that the multi-hop relation rules are hard to be reliably memorized due to the inherent deficiencies of such implicit memorization strategy, making embedding models underperform in predicting links between distant entity pairs. To alleviate this problem, we present Vertical Learning Paradigm (VLP), which extends embedding models by allowing to explicitly copy target information from related factual triples for more accurate prediction. Rather than solely relying on the implicit memory, VLP directly provides additional cues to improve the generalization ability of embedding models, especially making the distant link prediction significantly easier. Moreover, we also propose a novel relative distance based negative sampling technique (ReD) for more effective optimization. Experiments demonstrate the validity and generality of our proposals on two standard benchmarks. Our code is available at https://github.com/rui9812/VLP.", + "authors": [ + "Rui Li", + "Xu Chen", + "Chaozhuo Li", + "Yanming Shen", + "Jianan Zhao", + "Yujing Wang", + "Weihao Han", + "Hao Sun", + "Weiwei Deng", + "Qi Zhang", + "Xing Xie" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.349", + "point2d": [ + 56.0698127746582, + -65.78497314453125 + ], + "cluster": 45.0 + }, + { + "idx": 351, + "title": "CoAD: Automatic Diagnosis through Symptom and Disease Collaborative Generation", + "abstract": "Automatic diagnosis (AD), a critical application of AI in healthcare, employs machine learning techniques to assist doctors in gathering patient symptom information for precise disease diagnosis. The Transformer-based method utilizes an input symptom sequence, predicts itself through auto-regression, and employs the hidden state of the final symptom to determine the disease. 
Despite its simplicity and superior performance demonstrated, a decline in disease diagnosis accuracy is observed, caused by 1) a mismatch between symptoms observed during training and generation, and 2) the effect of different symptom orders on disease prediction. To address the above obstacles, we introduce the CoAD, a novel disease and symptom collaborative generation framework, which incorporates several key innovations to improve AD: 1) aligning sentence-level disease labels with multiple possible symptom inquiry steps to bridge the gap between training and generation; 2) expanding symptom labels for each sub-sequence of symptoms to enhance annotation and eliminate the effect of symptom order; 3) developing a repeated symptom input schema to effectively and efficiently learn the expanded disease and symptom labels. We evaluate the CoAD framework using four datasets, including three public and one private, and demonstrate that it achieves an average 2.3% improvement over previous state-of-the-art results in automatic disease diagnosis. For reproducibility, we release the code and data at https://github.com/KwanWaiChung/coad.", + "authors": [ + "Huimin Wang", + "Wai Chung Kwan", + "Kam-Fai Wong", + "Yefeng Zheng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.350", + "point2d": [ + 32.149322509765625, + -40.781227111816406 + ], + "cluster": 42.0 + }, + { + "idx": 352, + "title": "Long-Tailed Question Answering in an Open World", + "abstract": "Real-world data often have an open long-tailed distribution, and building a unified QA model supporting various tasks is vital for practical QA applications. However, it is non-trivial to extend previous QA approaches since they either require access to seen tasks of adequate samples or do not explicitly model samples from unseen tasks. In this paper, we define Open Long-Tailed QA (OLTQA) as learning from long-tailed distributed data and optimizing performance over seen and unseen QA tasks. We propose an OLTQA model that encourages knowledge sharing between head, tail and unseen tasks, and explicitly mines knowledge from a large pre-trained language model (LM). Specifically, we organize our model through a pool of fine-grained components and dynamically combine these components for an input to facilitate knowledge sharing. A retrieve-then-rerank frame is further introduced to select in-context examples, which guide the LM to generate text that expresses knowledge for QA tasks. Moreover, a two-stage training approach is introduced to pre-train the framework by knowledge distillation (KD) from the LM and then jointly train the frame and a QA model through an adaptive mutual KD method. On a large-scale OLTQA dataset we curate from 43 existing QA datasets, our model consistently outperforms the state-of-the-art.", + "authors": [ + "Yi Dai", + "Hao Lang", + "Yinhe Zheng", + "Fei Huang", + "Yongbin Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.351", + "point2d": [ + 61.810455322265625, + 10.341920852661133 + ], + "cluster": 5.0 + }, + { + "idx": 353, + "title": "Parallel Context Windows for Large Language Models", + "abstract": "When applied to processing long text, Large Language Models (LLMs) are limited by their context window. Existing efforts to address this limitation involve training specialized architectures, and cannot be easily applied to off-the-shelf LLMs.
We present Parallel Context Windows (PCW), a method that alleviates the context window restriction for any off-the-shelf LLM without further training. The key to the approach is to carve a long context into chunks (\u201cwindows\u201d), restrict the attention mechanism to apply only within each window, and re-use the positional embeddings across the windows. Our main results test the PCW approach on in-context learning with models that range in size between 750 million and 178 billion parameters, and show substantial improvements for tasks with diverse input and output spaces. We show additional benefits in other settings where long context windows may be beneficial: multi-hop questions and retrieval-augmented question answering with multiple retrieved documents. Our results highlight Parallel Context Windows as a promising method for applying off-the-shelf LLMs in a range of settings that require long text sequences. We make our code publicly available at https://github.com/ai21labs/parallel-context-windows.", + "authors": [ + "Nir Ratner", + "Yoav Levine", + "Yonatan Belinkov", + "Ori Ram", + "Inbal Magar", + "Omri Abend", + "Ehud Karpas", + "Amnon Shashua", + "Kevin Leyton-Brown", + "Yoav Shoham" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.352", + "point2d": [ + -16.962846755981445, + -25.18450927734375 + ], + "cluster": 20.0 + }, + { + "idx": 354, + "title": "Efficient Transformers with Dynamic Token Pooling", + "abstract": "Transformers achieve unrivalled performance in modelling language, but remain inefficient in terms of memory and time complexity. A possible remedy is to reduce the sequence length in the intermediate layers by pooling fixed-length segments of tokens. Nevertheless, natural units of meaning, such as words or phrases, display varying sizes. To address this mismatch, we equip language models with a dynamic-pooling mechanism, which predicts segment boundaries in an autoregressive fashion. We compare several methods to infer boundaries, including end-to-end learning through stochastic re-parameterisation, supervised learning (based on segmentations from subword tokenizers or spikes in conditional entropy), as well as linguistically motivated boundaries. We perform character-level evaluation on texts from multiple datasets and morphologically diverse languages. The results demonstrate that dynamic pooling, which jointly segments and models language, is both faster and more accurate than vanilla Transformers and fixed-length pooling within the same computational budget.", + "authors": [ + "Piotr Nawrot", + "Jan Chorowski", + "Adrian Lancucki", + "Edoardo Maria Ponti" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.353", + "point2d": [ + -35.563140869140625, + -31.897016525268555 + ], + "cluster": 6.0 + }, + { + "idx": 355, + "title": "Did the Models Understand Documents? Benchmarking Models for Language Understanding in Document-Level Relation Extraction", + "abstract": "Document-level relation extraction (DocRE) has attracted increasing research interest recently. While models achieve consistent performance gains in DocRE, their underlying decision rules are still understudied: Do they make the right predictions according to rationales?
In this paper, we take the first step toward answering this question and then introduce a new perspective on comprehensively evaluating a model. Specifically, we first conduct annotations to provide the rationales considered by humans in DocRE. Then, we conduct investigations and discover the fact that: In contrast to humans, the representative state-of-the-art (SOTA) models in DocRE exhibit different reasoning processes. Through our proposed RE-specific attacks, we next demonstrate that the significant discrepancy in decision rules between models and humans severely damages the robustness of models. After that, we introduce mean average precision (MAP) to evaluate the understanding and reasoning capabilities of models. According to the extensive experimental results, we finally appeal to future work to consider evaluating the understanding ability of models because the improved ability renders models more trustworthy and robust to be deployed in real-world scenarios. We make our annotations and code publicly available.", + "authors": [ + "Haotian Chen", + "Bingsheng Chen", + "Xiangdong Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.354", + "point2d": [ + 41.76258087158203, + -62.19569396972656 + ], + "cluster": 25.0 + }, + { + "idx": 356, + "title": "ContraCLM: Contrastive Learning For Causal Language Model", + "abstract": "Despite exciting progress in causal language models, the expressiveness of their representations is largely limited due to poor discrimination ability. To remedy this issue, we present CONTRACLM, a novel contrastive learning framework at both the token-level and the sequence-level. We assess CONTRACLM on a variety of downstream tasks. We show that CONTRACLM enhances the discrimination of representations and bridges the gap with encoder-only models, which makes causal language models better suited for tasks beyond language generation. Specifically, we attain 44% relative improvement on the Semantic Textual Similarity tasks and 34% on Code-to-Code Search tasks.
Furthermore, by improving the expressiveness of representations, CONTRACLM also boosts the source code generation capability with 9% relative improvement on execution accuracy on the HumanEval benchmark.", + "authors": [ + "Nihal Jain", + "Dejiao Zhang", + "Wasi Uddin Ahmad", + "Zijian Wang", + "Feng Nan", + "Xiaopeng Li", + "Ming Tan", + "Ramesh Nallapati", + "Baishakhi Ray", + "Parminder Bhatia", + "Xiaofei Ma", + "Bing Xiang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.355", + "point2d": [ + -11.541337966918945, + -49.29712677001953 + ], + "cluster": 11.0 + }, + { + "idx": 357, + "title": "Advancing Multi-Criteria Chinese Word Segmentation Through Criterion Classification and Denoising", + "abstract": "Recent research on multi-criteria Chinese word segmentation (MCCWS) mainly focuses on building complex private structures, adding more handcrafted features, or introducing complex optimization processes. In this work, we show that through a simple yet elegant input-hint-based MCCWS model, we can achieve state-of-the-art (SoTA) performances on several datasets simultaneously. We further propose a novel criterion-denoising objective that hurts slightly on F1 score but achieves SoTA recall on out-of-vocabulary words. Our result establishes a simple yet strong baseline for future MCCWS research. Source code is available at https://github.com/IKMLab/MCCWS.", + "authors": [ + "Tzu Hsuan Chou", + "Chun-Yi Lin", + "Hung-Yu Kao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.356", + "point2d": [ + -46.203495025634766, + 7.523011207580566 + ], + "cluster": 30.0 + }, + { + "idx": 358, + "title": "Infusing Hierarchical Guidance into Prompt Tuning: A Parameter-Efficient Framework for Multi-level Implicit Discourse Relation Recognition", + "abstract": "Multi-level implicit discourse relation recognition (MIDRR) aims at identifying hierarchical discourse relations among arguments. Previous methods achieve the promotion through fine-tuning PLMs. However, due to the data scarcity and the task gap, the pre-trained feature space cannot be accurately tuned to the task-specific space, which even aggravates the collapse of the vanilla space. Besides, the comprehension of hierarchical semantics for MIDRR makes the conversion much harder. In this paper, we propose a prompt-based Parameter-Efficient Multi-level IDRR (PEMI) framework to solve the above problems. First, we leverage parameter-efficient prompt tuning to drive the inputted arguments to match the pre-trained space and realize the approximation with few parameters. Furthermore, we propose a hierarchical label refining (HLR) method for the prompt verbalizer to deeply integrate hierarchical guidance into the prompt tuning. Finally, our model achieves comparable results on PDTB 2.0 and 3.0 using about 0.1% trainable parameters compared with baselines and the visualization demonstrates the effectiveness of our HLR method.", + "authors": [ + "Haodong Zhao", + "Ruifang He", + "Mengnan Xiao", + "Jing Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.357", + "point2d": [ + 15.439607620239258, + -49.462371826171875 + ], + "cluster": 27.0 + }, + { + "idx": 359, + "title": "Contrastive Learning with Adversarial Examples for Alleviating Pathology of Language Model", + "abstract": "Neural language models have achieved superior performance.
However, these models also suffer from the pathology of overconfidence in the out-of-distribution examples, potentially making the model difficult to interpret and making the interpretation methods fail to provide faithful attributions. In this paper, we explain the model pathology from the view of sentence representation and argue that the counter-intuitive bias degree and direction of the out-of-distribution examples\u2019 representation cause the pathology. We propose a Contrastive learning regularization method using Adversarial examples for Alleviating the Pathology (ConAAP), which calibrates the sentence representation of out-of-distribution examples. ConAAP generates positive and negative examples following the attribution results and utilizes adversarial examples to introduce direction information in regularization. Experiments show that ConAAP effectively alleviates the model pathology while slightly impacting the generalization ability on in-distribution examples and thus helps interpretation methods obtain more faithful results.", + "authors": [ + "Pengwei Zhan", + "Jing Yang", + "Xiao Huang", + "Chunlei Jing", + "Jingying Li", + "Liming Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.358", + "point2d": [ + 3.8205950260162354, + 3.3179478645324707 + ], + "cluster": 48.0 + }, + { + "idx": 360, + "title": "Are Fairy Tales Fair? Analyzing Gender Bias in Temporal Narrative Event Chains of Children\u2019s Fairy Tales", + "abstract": "Social biases and stereotypes are embedded in our culture in part through their presence in our stories, as evidenced by the rich history of humanities and social science literature analyzing such biases in children\u2019s stories. Because these analyses are often conducted manually and at a small scale, such investigations can benefit from the use of more recent natural language processing (NLP) methods that examine social bias in models and data corpora. Our work joins this interdisciplinary effort and makes a unique contribution by taking into account the event narrative structures when analyzing the social bias of stories. We propose a computational pipeline that automatically extracts a story\u2019s temporal narrative verb-based event chain for each of its characters as well as character attributes such as gender. We also present a verb-based event annotation scheme that can facilitate bias analysis by including categories such as those that align with traditional stereotypes. Through a case study analyzing gender bias in fairy tales, we demonstrate that our framework can reveal bias in not only the unigram verb-based events in which female and male characters participate but also in the temporal narrative order of such event participation.", + "authors": [ + "Paulina Toro Isaza", + "Guangxuan Xu", + "Toye Oloko", + "Yufang Hou", + "Nanyun Peng", + "Dakuo Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.359", + "point2d": [ + 27.318044662475586, + 35.52957534790039 + ], + "cluster": 19.0 + }, + { + "idx": 361, + "title": "FutureTOD: Teaching Future Knowledge to Pre-trained Language Model for Task-Oriented Dialogue", + "abstract": "Pre-trained language models based on general text enable huge success in the NLP scenario. But the intrinsic difference of linguistic patterns between general text and task-oriented dialogues makes existing pre-trained language models less useful in practice.
Current dialogue pre-training methods rely on a contrastive framework and face the challenges of both selecting true positives and hard negatives. In this paper, we propose a novel dialogue pre-training model, FutureTOD, which distills future knowledge to the representation of the previous dialogue context using a self-training framework. Our intuition is that a good dialogue representation both learns local context information and predicts future information. Extensive experiments on diverse downstream dialogue tasks demonstrate the effectiveness of our model, especially the generalization, robustness, and learning discriminative dialogue representations capabilities.", + "authors": [ + "Weihao Zeng", + "Keqing He", + "Yejie Wang", + "Chen Zeng", + "Jingang Wang", + "Yunsen Xian", + "Weiran Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.360", + "point2d": [ + 5.366441249847412, + 67.24922943115234 + ], + "cluster": 49.0 + }, + { + "idx": 362, + "title": "LAMBADA: Backward Chaining for Automated Reasoning in Natural Language", + "abstract": "Remarkable progress has been made on automated reasoning with natural text, by using Large Language Models (LLMs) and methods such as Chain-of-Thought prompting and Selection-Inference. These techniques search for proofs in the forward direction from axioms to the conclusion, which suffers from a combinatorial explosion of the search space, and thus high failure rates for problems requiring longer chains of reasoning. The classical automated reasoning literature has shown that reasoning in the backward direction (i.e. from intended conclusion to supporting axioms) is significantly more efficient at proof-finding. Importing this intuition into the LM setting, we develop a Backward Chaining algorithm, called LAMBADA, that decomposes reasoning into four sub-modules, that are simply implemented by few-shot prompted LLM inference. We show that LAMBADA achieves sizable accuracy boosts over state-of-the-art forward reasoning methods on two challenging logical reasoning datasets, particularly when deep and accurate proof chains are required.", + "authors": [ + "Mehran Kazemi", + "Najoung Kim", + "Deepti Bhatia", + "Xin Xu", + "Deepak Ramachandran" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.361", + "point2d": [ + 50.47209167480469, + -17.228717803955078 + ], + "cluster": 31.0 + }, + { + "idx": 363, + "title": "PeaCoK: Persona Commonsense Knowledge for Consistent and Engaging Narratives", + "abstract": "Sustaining coherent and engaging narratives requires dialogue or storytelling agents to understand how the personas of speakers or listeners ground the narrative. Specifically, these agents must infer personas of their listeners to produce statements that cater to their interests. They must also learn to maintain consistent speaker personas for themselves throughout the narrative, so that their counterparts feel involved in a realistic conversation or story. However, personas are diverse and complex: they entail large quantities of rich interconnected world knowledge that is challenging to robustly represent in general narrative systems (e.g., a singer is good at singing, and may have attended conservatoire). In this work, we construct a new large-scale persona commonsense knowledge graph, PeaCoK, containing ~100K human-validated persona facts.
Our knowledge graph schematizes five dimensions of persona knowledge identified in previous studies of human interactive behaviours, and distils facts in this schema from both existing commonsense knowledge graphs and large-scale pretrained language models. Our analysis indicates that PeaCoK contains rich and precise world persona inferences that help downstream systems generate more consistent and engaging narratives.", + "authors": [ + "Silin Gao", + "Beatriz Borges", + "Soyoung Oh", + "Deniz Bayazit", + "Saya Kanno", + "Hiromi Wakaki", + "Yuki Mitsufuji", + "Antoine Bosselut" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.362", + "point2d": [ + 11.497509956359863, + 53.700199127197266 + ], + "cluster": 31.0 + }, + { + "idx": 364, + "title": "OpenSR: Open-Modality Speech Recognition via Maintaining Multi-Modality Alignment", + "abstract": "Speech Recognition builds a bridge between the multimedia streaming (audio-only, visual-only or audio-visual) and the corresponding text transcription. However, when training the specific model of new domain, it often gets stuck in the lack of new-domain utterances, especially the labeled visual utterances. To break through this restriction, we attempt to achieve zero-shot modality transfer by maintaining the multi-modality alignment in phoneme space learned with unlabeled multimedia utterances in the high resource domain during the pre-training, and propose a training system Open-modality Speech Recognition (OpenSR) that enables the models trained on a single modality (e.g., audio-only) applicable to more modalities (e.g., visual-only and audio-visual). Furthermore, we employ a cluster-based prompt tuning strategy to handle the domain shift for the scenarios with only common words in the new domain utterances. We demonstrate that OpenSR enables modality transfer from one to any in three different settings (zero-, few- and full-shot), and achieves highly competitive zero-shot performance compared to the existing few-shot and full-shot lip-reading methods. To the best of our knowledge, OpenSR achieves the state-of-the-art performance of word error rate in LRS2 on audio-visual speech recognition and lip-reading with 2.7% and 25.0%, respectively.", + "authors": [ + "Xize Cheng", + "Tao Jin", + "Linjun Li", + "Wang Lin", + "Xinyu Duan", + "Zhou Zhao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.363", + "point2d": [ + -67.88214874267578, + 25.100997924804688 + ], + "cluster": 37.0 + }, + { + "idx": 365, + "title": "Retrieval-free Knowledge Injection through Multi-Document Traversal for Dialogue Models", + "abstract": "Dialogue models are often enriched with extensive external knowledge to provide informative responses through a retrieval-augmented pipeline. Nevertheless, retrieval-augmented approaches rely on finely annotated retrieval training data and knowledge-grounded response generation data, making it costly to transfer. To tackle this challenge, this paper proposes a retrieval-free approach, KiDG, by automatically turning knowledge documents into simulated multi-turn dialogues through a Multi-Document Traversal algorithm. The simulated knowledge-intensive dialogues constructed by KiDG in one domain can be easily used to train and enhance pre-trained dialogue models\u2019 knowledge w.r.t. this domain without costly annotation. We conduct extensive experiments comparing retrieval-augmented models and a variety of retrieval-free models.
We found that dialogue models enhanced with data simulated by KiDG largely outperform state-of-the-art retrieval-free methods, and achieve performance comparable to retrieval-augmented methods while being better and cheaper at domain transfer.",
We conduct a series of experiments on three public datasets, and the results indicate that our proposed approach performs the best in generative retrieval, demonstrating its effectiveness and robustness.", + "authors": [ + "Yongqi Li", + "Nan Yang", + "Liang Wang", + "Furu Wei", + "Wenjie Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.366", + "point2d": [ + 16.087783813476562, + -14.76993179321289 + ], + "cluster": 18.0 + }, + { + "idx": 368, + "title": "Prompting Language Models for Linguistic Structure", + "abstract": "Although pretrained language models (PLMs) can be prompted to perform a wide range of language tasks, it remains an open question how much this ability comes from generalizable linguistic understanding versus surface-level lexical patterns. To test this, we present a structured prompting approach for linguistic structured prediction tasks, allowing us to perform zero- and few-shot sequence tagging with autoregressive PLMs. We evaluate this approach on part-of-speech tagging, named entity recognition, and sentence chunking, demonstrating strong few-shot performance in all cases. We also find that while PLMs contain significant prior knowledge of task labels due to task leakage into the pretraining corpus, structured prompting can also retrieve linguistic structure with arbitrary labels. These findings indicate that the in-context learning ability and linguistic knowledge of PLMs generalizes beyond memorization of their training data.", + "authors": [ + "Terra Blevins", + "Hila Gonen", + "Luke Zettlemoyer" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.367", + "point2d": [ + -16.31955337524414, + -11.868894577026367 + ], + "cluster": 3.0 + }, + { + "idx": 369, + "title": "Trillion Dollar Words: A New Financial Dataset, Task & Market Analysis", + "abstract": "Monetary policy pronouncements by Federal Open Market Committee (FOMC) are a major driver of financial market returns. We construct the largest tokenized and annotated dataset of FOMC speeches, meeting minutes, and press conference transcripts in order to understand how monetary policy influences financial markets. In this study, we develop a novel task of hawkish-dovish classification and benchmark various pre-trained language models on the proposed dataset. Using the best-performing model (RoBERTa-large), we construct a measure of monetary policy stance for the FOMC document release days. To evaluate the constructed measure, we study its impact on the treasury market, stock market, and macroeconomic indicators. Our dataset, models, and code are publicly available on Huggingface and GitHub under CC BY-NC 4.0 license.", + "authors": [ + "Agam Shah", + "Suvan Paturi", + "Sudheer Chava" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.368", + "point2d": [ + 34.75077438354492, + 3.9346675872802734 + ], + "cluster": 19.0 + }, + { + "idx": 370, + "title": "RE-Matching: A Fine-Grained Semantic Matching Method for Zero-Shot Relation Extraction", + "abstract": "Semantic matching is a mainstream paradigm of zero-shot relation extraction, which matches a given input with a corresponding label description. The entities in the input should exactly match their hypernyms in the description, while the irrelevant contexts should be ignored when matching.However, general matching methods lack explicit modeling of the above matching pattern. 
In this work, we propose a fine-grained semantic matching method tailored for zero-shot relation extraction. Guided by the above matching pattern, we decompose the sentence-level similarity score into the entity matching score and context matching score. Considering that not all contextual words contribute equally to the relation semantics, we design a context distillation module to reduce the negative impact of irrelevant components on context matching. Experimental results show that our method achieves higher matching accuracy and more than 10 times faster inference than state-of-the-art methods.",
We show that the model trained with our datasets significantly outperforms the currently used statistical Korean GEC system (Hanspell) on a wider range of error types, demonstrating the diversity and usefulness of the datasets. The implementations and datasets are open-sourced.", + "authors": [ + "Soyoung Yoon", + "Sungjoon Park", + "Gyuwan Kim", + "Junhee Cho", + "Kihyo Park", + "Gyu Tae Kim", + "Minjoon Seo", + "Alice Oh" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.371", + "point2d": [ + -39.641902923583984, + 13.820283889770508 + ], + "cluster": 46.0 + }, + { + "idx": 373, + "title": "FLamE: Few-shot Learning from Natural Language Explanations", + "abstract": "Natural language explanations have the potential to provide rich information that in principle guides model reasoning.Yet, recent work by Lampinen et al. has shown limited utility of natural language explanations in improving classification.To effectively learn from explanations, we present FLamE, a two-stage few-shot learning framework that first generates explanations using GPT-3, and then fine-tunes a smaller model (e.g., RoBERTa) with generated explanations.Our experiments on natural language inference demonstrate effectiveness over strong baselines, increasing accuracy by 17.6% over GPT-3 Babbage and 5.7% over GPT-3 Davinci in e-SNLI.Despite improving classification performance, human evaluation surprisingly reveals that the majority of generated explanations does not adequately justify classification decisions.Additional analyses point to the important role of label-specific cues (e.g., \u201cnot know\u201d for the neutral label) in generated explanations.", + "authors": [ + "Yangqiaoyu Zhou", + "Yiming Zhang", + "Chenhao Tan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.372", + "point2d": [ + 28.577045440673828, + -9.320280075073242 + ], + "cluster": 31.0 + }, + { + "idx": 374, + "title": "Learning Symbolic Rules over Abstract Meaning Representations for Textual Reinforcement Learning", + "abstract": "Text-based reinforcement learning agents have predominantly been neural network-based models with embeddings-based representation, learning uninterpretable policies that often do not generalize well to unseen games. On the other hand, neuro-symbolic methods, specifically those that leverage an intermediate formal representation, are gaining significant attention in language understanding tasks. This is because of their advantages ranging from inherent interpretability, the lesser requirement of training data, and being generalizable in scenarios with unseen data. Therefore, in this paper, we propose a modular, NEuro-Symbolic Textual Agent (NESTA) that combines a generic semantic parser with a rule induction system to learn abstract interpretable rules as policies. 
Our experiments on established text-based game benchmarks show that the proposed NESTA method outperforms deep reinforcement learning-based techniques by achieving better generalization to unseen test games and learning from fewer training interactions.", + "authors": [ + "Subhajit Chaudhury", + "Sarathkrishna Swaminathan", + "Daiki Kimura", + "Prithviraj Sen", + "Keerthiram Murugesan", + "Rosario Uceda-Sosa", + "Michiaki Tatsubori", + "Achille Fokoue", + "Pavan Kapanipathi", + "Asim Munawar", + "Alexander Gray" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.373", + "point2d": [ + 52.28125762939453, + -5.47199821472168 + ], + "cluster": 36.0 + }, + { + "idx": 375, + "title": "Counterfactual Debiasing for Fact Verification", + "abstract": "Fact verification aims to automatically judge the veracity of a claim according to several pieces of evidence. Due to the manual construction of datasets, spurious correlations between claim patterns and its veracity (i.e., biases) inevitably exist. Recent studies show that models usually learn such biases instead of understanding the semantic relationship between the claim and evidence. Existing debiasing works can be roughly divided into data-augmentation-based and weight-regularization-based pipeline, where the former is inflexible and the latter relies on the uncertain output on the training stage. Unlike previous works, we propose a novel method from a counterfactual view, namely CLEVER, which is augmentation-free and mitigates biases on the inference stage. Specifically, we train a claim-evidence fusion model and a claim-only model independently. Then, we obtain the final prediction via subtracting output of the claim-only model from output of the claim-evidence fusion model, which counteracts biases in two outputs so that the unbiased part is highlighted. Comprehensive experiments on several datasets have demonstrated the effectiveness of CLEVER.", + "authors": [ + "Weizhi Xu", + "Qiang Liu", + "Shu Wu", + "Liang Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.374", + "point2d": [ + 30.118663787841797, + 8.110954284667969 + ], + "cluster": 31.0 + }, + { + "idx": 376, + "title": "What social attitudes about gender does BERT encode? Leveraging insights from psycholinguistics", + "abstract": "Much research has sought to evaluate the degree to which large language models reflect social biases. We complement such work with an approach to elucidating the connections between language model predictions and people\u2019s social attitudes. We show how word preferences in a large language model reflect social attitudes about gender, using two datasets from human experiments that found differences in gendered or gender neutral word choices by participants with differing views on gender (progressive, moderate, or conservative). We find that the language model BERT takes into account factors that shape human lexical choice of such language, but may not weigh those factors in the same way people do. Moreover, we show that BERT\u2019s predictions most resemble responses from participants with moderate to conservative views on gender. 
Such findings illuminate how a language model (1) may differ from people in how it deploys words that signal gender, and (2) may prioritize some social attitudes over others.",
These observations confirm the challenges of detecting factual errors in dialogue summaries and call for further studies, for which our dataset and results offer a solid foundation.",
A few recent approaches leverage intermediate solution steps to improve final answer correctness but often cannot generate coherent steps with a clear solution strategy. Contrary to existing work, we focus on improving the correctness and coherence of the intermediate solution steps. We propose a step-by-step planning approach for intermediate solution generation, which strategically plans the generation of the next solution step based on the MWP and the previous solution steps. Our approach first plans the next step by predicting the math operation needed to proceed given the history of steps, then generates the next step token-by-token by prompting a language model with the predicted math operation. Experiments on the GSM8K dataset demonstrate that our approach improves the accuracy and interpretability of the solutions under both automatic metrics and human evaluation.",
We also see that distilling the relationship knowledge from a lower layer helps models produce better output distributions. Experimental results demonstrate that our method makes substantial improvements in quantifying ambiguity without gold distribution labels. As positive side effects, our method is found to reduce the model size significantly and improve latency, both attractive aspects of NLU products.",
Moreover, some of them have been used successfully in developing effective knowledge graph embedding (KGE) models. Inspired by this synergy, in this work we propose a new KGE model that leverages all three operations. Since translation, rotation, and scaling operations are cascaded to form a composite one, the new model is named CompoundE. By casting CompoundE in the framework of group theory, we show that quite a few distance-based KGE models are special cases of CompoundE. CompoundE extends the simple distance-based scoring functions to relation-dependent compound operations on head and/or tail entities. To demonstrate the effectiveness of CompoundE, we perform three prevalent KG prediction tasks, including link prediction, path query answering, and entity typing, on a range of datasets. CompoundE consistently outperforms extant models, demonstrating its effectiveness and flexibility.",
Afterward, we execute the program by delegating each sub-task to the corresponding sub-task handler. This process makes our model both explanatory and data-efficient, providing clear explanations of its reasoning process and requiring minimal training data. We evaluate ProgramFC on two challenging fact-checking datasets and show that it outperforms seven fact-checking baselines across different settings of evidence availability, with explicit output programs that benefit human debugging. Our code and data are publicly available at https://github.com/mbzuai-nlp/ProgramFC.",
Furthermore, it provides substantial advantages for low-resource languages in unsupervised sentence retrieval and for target languages that differ greatly from the source language in cross-lingual transfer.", + "authors": [ + "Jiali Zeng", + "Yufan Jiang", + "Yongjing Yin", + "Yi Jing", + "Fandong Meng", + "Binghuai Lin", + "Yunbo Cao", + "Jie Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.388", + "point2d": [ + -15.206243515014648, + -28.207012176513672 + ], + "cluster": 20.0 + }, + { + "idx": 390, + "title": "Curriculum Learning for Graph Neural Networks: A Multiview Competence-based Approach", + "abstract": "A curriculum is a planned sequence of learning materials and an effective one can make learning efficient and effective for both humans and machines. Recent studies developed effective data-driven curriculum learning approaches for training graph neural networks in language applications. However, existing curriculum learning approaches often employ a single criterion of difficulty in their training paradigms. In this paper, we propose a new perspective on curriculum learning by introducing a novel approach that builds on graph complexity formalisms (as difficulty criteria) and model competence during training. The model consists of a scheduling scheme which derives effective curricula by accounting for different views of sample difficulty and model competence during training. The proposed solution advances existing research in curriculum learning for graph neural networks with the ability to incorporate a fine-grained spectrum of graph difficulty criteria in their training paradigms. Experimental results on real-world link prediction and node classification tasks illustrate the effectiveness of the proposed approach.", + "authors": [ + "Nidhi Vakil", + "Hadi Amiri" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.389", + "point2d": [ + -11.005023002624512, + -40.187747955322266 + ], + "cluster": 39.0 + }, + { + "idx": 391, + "title": "When and how to paraphrase for named entity recognition?", + "abstract": "While paraphrasing is a promising approach for data augmentation in classification tasks, its effect on named entity recognition (NER) is not investigated systematically due to the difficulty of span-level label preservation. In this paper, we utilize simple strategies to annotate entity spans in generations and compare established and novel methods of paraphrasing in NLP such as back translation, specialized encoder-decoder models such as Pegasus, and GPT-3 variants for their effectiveness in improving downstream performance for NER across different levels of gold annotations and paraphrasing strength on 5 datasets. We thoroughly explore the influence of paraphrasers, and dynamics between paraphrasing strength and gold dataset size on the NER performance with visualizations and statistical testing. We find that the choice of the paraphraser greatly impacts NER performance, with one of the larger GPT-3 variants exceedingly capable of generating high quality paraphrases, yielding statistically significant improvements in NER performance with increasing paraphrasing strength, while other paraphrasers show more mixed results. Additionally, inline auto annotations generated by larger GPT-3 are strictly better than heuristic based annotations. We also find diminishing benefits of paraphrasing as gold annotations increase for most datasets. 
Furthermore, while most paraphrasers promote entity memorization in NER, the proposed GPT-3 configuration performs most favorably among the compared paraphrasers when tested on unseen entities, with memorization decreasing further as paraphrasing strength increases. Finally, we explore mention replacement using GPT-3, which provides additional benefits over base paraphrasing for specific datasets.",
Existing metrics, such as the task performance of the LM generating the rationales or the similarity between generated and gold rationales, are not good indicators of their human utility. While we observe that certain properties of rationales, like conciseness and novelty, are correlated with their human utility, estimating them without human involvement is challenging. We show that, by estimating a rationale\u2019s helpfulness in answering similar unseen instances, we can measure its human utility to a better extent. We also translate this finding into an automated score, Gen-U, which can help improve LMs\u2019 ability to generate rationales with better human utility while maintaining most of their task performance. Lastly, we release all code and collected data with this project.",
In addition, we use a new verification process based on the embedding distance between candidate entity mentions and entity types to reduce the false-positive noise in weak labels generated by high-coverage dictionaries. We demonstrate that HighGEN outperforms the previous best model by an average of 4.7 F1 points across five NER benchmark datasets.",
Our findings motivate the development of models that address these blind spots.", + "authors": [ + "Ester Hlavnova", + "Sebastian Ruder" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.396", + "point2d": [ + -32.96048355102539, + -40.9771842956543 + ], + "cluster": 46.0 + }, + { + "idx": 398, + "title": "Local Byte Fusion for Neural Machine Translation", + "abstract": "Subword tokenization schemes are the dominant technique used in current NLP models. However, such schemes can be rigid and tokenizers built on one corpus may not adapt well to other parallel corpora. It has also been observed that in multilingual corpora, subword tokenization schemes oversegment low-resource languages, leading to a drop in translation performance. An alternative to subword tokenizers is byte-based tokenization, i.e., tokenization into byte sequences using the UTF-8 encoding scheme. Byte tokens often represent inputs at a sub-character granularity, i.e., one character can be represented by a span of byte tokens. This results in much longer byte sequences that are hard to interpret without aggregating local information from multiple byte tokens. In this paper, we propose a Local Byte Fusion (LOBEF) method for byte-based machine translation\u2014utilizing byte n-gram and word boundaries\u2014to aggregate local semantic information. Extensive experiments on multilingual translation, zero-shot cross-lingual transfer, and domain adaptation reveal a consistent improvement over vanilla byte-based models. Further analysis also indicates that our byte-based models are parameter-efficient and perform competitive to subword models.", + "authors": [ + "Makesh Narsimhan Sreedhar", + "Xiangpeng Wan", + "Yu Cheng", + "Junjie Hu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.397", + "point2d": [ + -67.2159423828125, + -11.789796829223633 + ], + "cluster": 21.0 + }, + { + "idx": 399, + "title": "Where\u2019s the Point? Self-Supervised Multilingual Punctuation-Agnostic Sentence Segmentation", + "abstract": "Many NLP pipelines split text into sentences as one of the crucial preprocessing steps. Prior sentence segmentation tools either rely on punctuation or require a considerable amount of sentence-segmented training data: both central assumptions might fail when porting sentence segmenters to diverse languages on a massive scale. In this work, we thus introduce a multilingual punctuation-agnostic sentence segmentation method, currently covering 85 languages, trained in a self-supervised fashion on unsegmented text, by making use of newline characters which implicitly perform segmentation into paragraphs. We further propose an approach that adapts our method to the segmentation in a given corpus by using only a small number (64-256) of sentence-segmented examples. The main results indicate that our method outperforms all the prior best sentence-segmentation tools by an average of 6.1% F1 points. Furthermore, we demonstrate that proper sentence segmentation has a point: the use of a (powerful) sentence segmenter makes a considerable difference for a downstream application such as machine translation (MT). 
By using our method to match sentence segmentation to the segmentation used during training of MT models, we achieve an average improvement of 2.3 BLEU points over the best prior segmentation tool, as well as massive gains over a trivial segmenter that splits text into equally-sized blocks.", + "authors": [ + "Benjamin Minixhofer", + "Jonas Pfeiffer", + "Ivan Vuli\u0107" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.398", + "point2d": [ + -48.865692138671875, + 9.801627159118652 + ], + "cluster": 46.0 + }, + { + "idx": 400, + "title": "Multi-target Backdoor Attacks for Code Pre-trained Models", + "abstract": "Backdoor attacks for neural code models have gained considerable attention due to the advancement of code intelligence. However, most existing works insert triggers into task-specific data for code-related downstream tasks, thereby limiting the scope of attacks. Moreover, the majority of attacks for pre-trained models are designed for understanding tasks. In this paper, we propose task-agnostic backdoor attacks for code pre-trained models. Our backdoored model is pre-trained with two learning strategies (i.e., Poisoned Seq2Seq learning and token representation learning) to support the multi-target attack of downstream code understanding and generation tasks. During the deployment phase, the implanted backdoors in the victim models can be activated by the designed triggers to achieve the targeted attack. We evaluate our approach on two code understanding tasks and three code generation tasks over seven datasets. Extensive experimental results demonstrate that our approach effectively and stealthily attacks code-related downstream tasks.", + "authors": [ + "Yanzhou Li", + "Shangqing Liu", + "Kangjie Chen", + "Xiaofei Xie", + "Tianwei Zhang", + "Yang Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.399", + "point2d": [ + -5.819383144378662, + -58.955345153808594 + ], + "cluster": 48.0 + }, + { + "idx": 401, + "title": "Learning Better Masking for Better Language Model Pre-training", + "abstract": "Masked Language Modeling (MLM) has been widely used as the denoising objective in pre-training language models (PrLMs). Existing PrLMs commonly adopt a Random-Token Masking strategy where a fixed masking ratio is applied and different contents are masked by an equal probability throughout the entire training. However, the model may receive complicated impact from pre-training status, which changes accordingly as training time goes on. In this paper, we show that such time-invariant MLM settings on masking ratio and masked content are unlikely to deliver an optimal outcome, which motivates us to explore the influence of time-variant MLM settings. We propose two scheduled masking approaches that adaptively tune the masking ratio and masked content in different training stages, which improves the pre-training efficiency and effectiveness verified on the downstream tasks. 
Our work is a pioneering study of time-variant masking strategies on both ratio and content, and it gives a better understanding of how the masking ratio and masked content influence MLM pre-training.",
Our experiments are run for the Icelandic language but should hold for other similar languages, in particular morphologically rich ones.", + "authors": [ + "Svanhv\u00edt Lilja Ing\u00f3lfsd\u00f3ttir", + "Petur Ragnarsson", + "Haukur J\u00f3nsson", + "Haukur Simonarson", + "Vilhjalmur Thorsteinsson", + "V\u00e9steinn Sn\u00e6bjarnarson" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.402", + "point2d": [ + -38.56435012817383, + 14.5287504196167 + ], + "cluster": 46.0 + }, + { + "idx": 404, + "title": "Multi-Level Knowledge Distillation for Out-of-Distribution Detection in Text", + "abstract": "Self-supervised representation learning has proved to be a valuable component for out-of-distribution (OoD) detection with only the texts of in-distribution (ID) examples. These approaches either train a language model from scratch or fine-tune a pre-trained language model using ID examples, and then take the perplexity output by the language model as OoD scores. In this paper, we analyze the complementary characteristic of both methods and propose a multi-level knowledge distillation approach that integrates their strengths while mitigating their limitations. Specifically, we use a fine-tuned model as the teacher to teach a randomly initialized student model on the ID examples. Besides the prediction layer distillation, we present a similarity-based intermediate layer distillation method to thoroughly explore the representation space of the teacher model. In this way, the learned student can better represent the ID data manifold while gaining a stronger ability to map OoD examples outside the ID data manifold with the regularization inherited from pre-training. Besides, the student model sees only ID examples during parameter learning, further promoting more distinguishable features for OoD detection. We conduct extensive experiments over multiple benchmark datasets, i.e., CLINC150, SST, ROSTD, 20 NewsGroups, and AG News; showing that the proposed method yields new state-of-the-art performance. We also explore its application as an AIGC detector to distinguish answers generated by ChatGPT and human experts. It is observed that our model exceeds human evaluators in the pair-expert task on the Human ChatGPT Comparison Corpus.", + "authors": [ + "Qianhui Wu", + "Huiqiang Jiang", + "Haonan Yin", + "B\u00f6rje Karlsson", + "Chin-Yew Lin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.403", + "point2d": [ + -2.9768643379211426, + -6.955269813537598 + ], + "cluster": 17.0 + }, + { + "idx": 405, + "title": "Peeking inside the black box: A Commonsense-aware Generative Framework for Explainable Complaint Detection", + "abstract": "Complaining is an illocutionary act in which the speaker communicates his/her dissatisfaction with a set of circumstances and holds the hearer (the complainee) answerable, directly or indirectly. Considering breakthroughs in machine learning approaches, the complaint detection task has piqued the interest of the natural language processing (NLP) community. Most of the earlier studies failed to justify their findings, necessitating the adoption of interpretable models that can explain the model\u2019s output in real time. We introduce an explainable complaint dataset, X-CI, the first benchmark dataset for explainable complaint detection.
Each instance in the X-CI dataset is annotated with five labels: complaint label, emotion label, polarity label, complaint severity level, and rationale (explainability), i.e., the causal span explaining the reason for the complaint/non-complaint label. We address the task of explainable complaint detection and propose a commonsense-aware unified generative framework by reframing the multitask problem as a text-to-text generation task. Our framework can predict the complaint cause, severity level, emotion, and polarity of the text in addition to detecting whether it is a complaint or not. We further establish the advantages of our proposed model on various evaluation metrics over the state-of-the-art models and other baselines when applied to the X-CI dataset in both full and few-shot settings.", + "authors": [ + "Apoorva Singh", + "Raghav Jain", + "Prince Jha", + "Sriparna Saha" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.404", + "point2d": [ + 13.311625480651855, + 35.19099426269531 + ], + "cluster": 34.0 + }, + { + "idx": 406, + "title": "MMDialog: A Large-scale Multi-turn Dialogue Dataset Towards Multi-modal Open-domain Conversation", + "abstract": "Responding with multi-modal content has been recognized as an essential capability for an intelligent conversational agent. In this paper, we introduce the MMDialog dataset to facilitate multi-modal conversation better. MMDialog is composed of a curated set of 1.08 million real-world dialogues with 1.53 million unique images across 4,184 topics. MMDialog has two main and unique advantages. First, it is the largest multi-modal conversation dataset by the number of dialogues by 88x. Second, it contains massive topics to generalize the open domain. To build an engaging dialogue system with this dataset, we propose and normalize two response prediction tasks based on retrieval and generative scenarios. In addition, we build two baselines for the above tasks with state-of-the-art techniques and report their experimental performance. We also propose a novel evaluation metric MM-Relevance to measure the multi-modal responses. Our dataset is available in https://github.com/victorsungo/MMDialog.", + "authors": [ + "Jiazhan Feng", + "Qingfeng Sun", + "Can Xu", + "Pu Zhao", + "Yaming Yang", + "Chongyang Tao", + "Dongyan Zhao", + "Qingwei Lin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.405", + "point2d": [ + 9.057121276855469, + 73.54945373535156 + ], + "cluster": 49.0 + }, + { + "idx": 407, + "title": "ByGPT5: End-to-End Style-conditioned Poetry Generation with Token-free Language Models", + "abstract": "State-of-the-art poetry generation systems are often complex. They either consist of task-specific model pipelines, incorporate prior knowledge in the form of manually created constraints, or both. In contrast, end-to-end models would not suffer from the overhead of having to model prior knowledge and could learn the nuances of poetry from data alone, reducing the degree of human supervision required. In this work, we investigate end-to-end poetry generation conditioned on styles such as rhyme, meter, and alliteration. We identify and address lack of training data and mismatching tokenization algorithms as possible limitations of past attempts. 
In particular, we successfully pre-train ByGPT5, a new token-free decoder-only language model, and fine-tune it on a large custom corpus of English and German quatrains annotated with our styles. We show that ByGPT5 outperforms other models such as mT5, ByT5, GPT-2 and ChatGPT, while also being more parameter efficient and performing favorably compared to humans. In addition, we analyze its runtime performance and demonstrate that it is not prone to memorization. We make our code, models, and datasets publicly available.", + "authors": [ + "Jonas Belouadi", + "Steffen Eger" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.406", + "point2d": [ + -30.603559494018555, + 39.77961730957031 + ], + "cluster": 35.0 + }, + { + "idx": 408, + "title": "Envisioning Future from the Past: Hierarchical Duality Learning for Multi-Turn Dialogue Generation", + "abstract": "In this paper, we define a widely neglected property in dialogue text, duality, which is a hierarchical property that is reflected in human behaviours in daily conversations: Based on the logic in a conversation (or a sentence), people can infer follow-up utterances (or tokens) based on the previous text, and vice versa. We propose a hierarchical duality learning for dialogue (HDLD) to simulate this human cognitive ability, for generating high quality responses that connect both previous and follow-up dialogues. HDLD utilizes hierarchical dualities at token hierarchy and utterance hierarchy. HDLD maximizes the mutual information between past and future utterances. Thus, even if future text is invisible during inference, HDLD is capable of estimating future information implicitly based on dialogue history and generates both coherent and informative responses. In contrast to previous approaches that solely utilize future text as auxiliary information to encode during training, HDLD leverages duality to enable interaction between dialogue history and the future. This enhances the utilization of dialogue data, leading to the improvement in both automatic and human evaluation.", + "authors": [ + "Ang Lv", + "Jinpeng Li", + "Shufang Xie", + "Rui Yan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.407", + "point2d": [ + 6.32056188583374, + 67.17976379394531 + ], + "cluster": 49.0 + }, + { + "idx": 409, + "title": "DualGATs: Dual Graph Attention Networks for Emotion Recognition in Conversations", + "abstract": "Capturing complex contextual dependencies plays a vital role in Emotion Recognition in Conversations (ERC). Previous studies have predominantly focused on speaker-aware context modeling, overlooking the discourse structure of the conversation. In this paper, we introduce Dual Graph ATtention networks (DualGATs) to concurrently consider the complementary aspects of discourse structure and speaker-aware context, aiming for more precise ERC. Specifically, we devise a Discourse-aware GAT (DisGAT) module to incorporate discourse structural information by analyzing the discourse dependencies between utterances. Additionally, we develop a Speaker-aware GAT (SpkGAT) module to incorporate speaker-aware contextual information by considering the speaker dependencies between utterances. Furthermore, we design an interaction module that facilitates the integration of the DisGAT and SpkGAT modules, enabling the effective interchange of relevant information between the two modules. 
We extensively evaluate our method on four datasets, and experimental results demonstrate that our proposed DualGATs surpass state-of-the-art baselines on the majority of the datasets.", + "authors": [ + "Duzhen Zhang", + "Feilong Chen", + "Xiuyi Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.408", + "point2d": [ + -36.309600830078125, + 64.45307159423828 + ], + "cluster": 23.0 + }, + { + "idx": 410, + "title": "Consistent Prototype Learning for Few-Shot Continual Relation Extraction", + "abstract": "Few-shot continual relation extraction aims to continually train a model on incrementally few-shot data to learn new relations while avoiding forgetting old ones. However, current memory-based methods are prone to overfitting memory samples, resulting in insufficient activation of old relations and limited ability to handle the confusion of similar classes. In this paper, we design a new N-way-K-shot Continual Relation Extraction (NK-CRE) task and propose a novel few-shot continual relation extraction method with Consistent Prototype Learning (ConPL) to address the aforementioned issues. Our proposed ConPL is mainly composed of three modules: 1) a prototype-based classification module that provides primary relation predictions under few-shot continual learning; 2) a memory-enhanced module designed to select vital samples and refined prototypical representations as a novel multi-information episodic memory; 3) a consistent learning module to reduce catastrophic forgetting by enforcing distribution consistency. To effectively mitigate catastrophic forgetting, ConPL ensures that the samples and prototypes in the episodic memory remain consistent in terms of classification and distribution. Additionally, ConPL uses prompt learning to extract better representations and adopts a focal loss to alleviate the confusion of similar classes. Experimental results on two commonly-used datasets show that our model consistently outperforms other competitive baselines.", + "authors": [ + "Xiudi Chen", + "Hui Wu", + "Xiaodong Shi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.409", + "point2d": [ + 38.627357482910156, + -61.71060562133789 + ], + "cluster": 38.0 + }, + { + "idx": 411, + "title": "Matching Pairs: Attributing Fine-Tuned Models to their Pre-Trained Large Language Models", + "abstract": "The wide applicability and adaptability of generative large language models (LLMs) have enabled their rapid adoption. While the pre-trained models can perform many tasks, such models are often fine-tuned to improve their performance on various downstream applications. However, this leads to issues over violation of model licenses, model theft, and copyright infringement. Moreover, recent advances show that generative technology is capable of producing harmful content which exacerbates the problems of accountability within model supply chains. Thus, we need a method to investigate how a model was trained or a piece of text was generated and what their pre-trained base model was. In this paper we take the first step to address this open problem by tracing back the origin of a given fine-tuned LLM to its corresponding pre-trained base model.
We consider different knowledge levels and attribution strategies, and find that we can correctly trace back 8 out of the 10 fine tuned models with our best method.", + "authors": [ + "Myles Foley", + "Ambrish Rawat", + "Taesung Lee", + "Yufang Hou", + "Gabriele Picco", + "Giulio Zizzo" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.410", + "point2d": [ + -9.34140682220459, + 13.578027725219727 + ], + "cluster": 4.0 + }, + { + "idx": 412, + "title": "Large Language Models Meet NL2Code: A Survey", + "abstract": "The task of generating code from a natural language description, or NL2Code, is considered a pressing and significant challenge in code intelligence. Thanks to the rapid development of pre-training techniques, surging large language models are being proposed for code, sparking the advances in NL2Code. To facilitate further research and applications in this field, in this paper, we present a comprehensive survey of 27 existing large language models for NL2Code, and also review benchmarks and metrics. We provide an intuitive comparison of all existing models on the HumanEval benchmark. Through in-depth observation and analysis, we provide some insights and conclude that the key factors contributing to the success of large language models for NL2Code are \u201cLarge Size, Premium Data, Expert Tuning\u201d. In addition, we discuss challenges and opportunities regarding the gap between models and humans. We also create a website https://nl2code.github.io to track the latest progress through crowd-sourcing. To the best of our knowledge, this is the first survey of large language models for NL2Code, and we believe it will contribute to the ongoing development of the field.", + "authors": [ + "Daoguang Zan", + "Bei Chen", + "Fengji Zhang", + "Dianjie Lu", + "Bingchao Wu", + "Bei Guan", + "Wang Yongji", + "Jian-Guang Lou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.411", + "point2d": [ + -10.309834480285645, + -50.86500549316406 + ], + "cluster": 11.0 + }, + { + "idx": 413, + "title": "When Does Aggregating Multiple Skills with Multi-Task Learning Work? A Case Study in Financial NLP", + "abstract": "Multi-task learning (MTL) aims at achieving a better model by leveraging data and knowledge from multiple tasks. However, MTL does not always work \u2013 sometimes negative transfer occurs between tasks, especially when aggregating loosely related skills, leaving it an open question when MTL works. Previous studies show that MTL performance can be improved by algorithmic tricks. However, what tasks and skills should be included is less well explored. In this work, we conduct a case study in Financial NLP where multiple datasets exist for skills relevant to the domain, such as numeric reasoning and sentiment analysis. Due to the task difficulty and data scarcity in the Financial NLP domain, we explore when aggregating such diverse skills from multiple datasets with MTL can work. Our findings suggest that the key to MTL success lies in skill diversity, relatedness between tasks, and choice of aggregation size and shared capacity. 
Specifically, MTL works well when tasks are diverse but related, and when the size of the task aggregation and the shared capacity of the model are balanced to avoid overwhelming certain tasks.", + "authors": [ + "Jingwei Ni", + "Zhijing Jin", + "Qian Wang", + "Mrinmaya Sachan", + "Markus Leippold" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.412", + "point2d": [ + 35.59030532836914, + -18.45926856994629 + ], + "cluster": 44.0 + }, + { + "idx": 414, + "title": "Enhancing Grammatical Error Correction Systems with Explanations", + "abstract": "Grammatical error correction systems improve written communication by detecting and correcting language mistakes. To help language learners better understand why the GEC system makes a certain correction, the causes of errors (evidence words) and the corresponding error types are two key factors. To enhance GEC systems with explanations, we introduce EXPECT, a large dataset annotated with evidence words and grammatical error types. We propose several baselines and analyses to understand this task. Furthermore, human evaluation verifies our explainable GEC system\u2019s explanations can assist second-language learners in determining whether to accept a correction suggestion and in understanding the associated grammar rule.", + "authors": [ + "Yuejiao Fei", + "Leyang Cui", + "Sen Yang", + "Wai Lam", + "Zhenzhong Lan", + "Shuming Shi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.413", + "point2d": [ + -37.057064056396484, + 15.576496124267578 + ], + "cluster": 36.0 + }, + { + "idx": 415, + "title": "Linguistic representations for fewer-shot relation extraction across domains", + "abstract": "Recent work has demonstrated the positive impact of incorporating linguistic representations as additional context and scaffolds on the in-domain performance of several NLP tasks. We extend this work by exploring the impact of linguistic representations on cross-domain performance in a few-shot transfer setting. An important question is whether linguistic representations enhance generalizability by providing features that function as cross-domain pivots. We focus on the task of relation extraction on three datasets of procedural text in two domains, cooking and materials science. Our approach augments a popular transformer-based architecture by alternately incorporating syntactic and semantic graphs constructed by freely available off-the-shelf tools. We examine their utility for enhancing generalization, and investigate whether earlier findings, e.g. that semantic representations can be more helpful than syntactic ones, extend to relation extraction in multiple domains. We find that while the inclusion of these graphs results in significantly higher performance in few-shot transfer, both types of graph exhibit roughly equivalent utility.", + "authors": [ + "Sireesh Gururaja", + "Ritam Dutt", + "Tinglong Liao", + "Carolyn Ros\u00e9" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.414", + "point2d": [ + 41.67438507080078, + -58.9473762512207 + ], + "cluster": 25.0 + }, + { + "idx": 416, + "title": "DarkBERT: A Language Model for the Dark Side of the Internet", + "abstract": "Recent research has suggested that there are clear differences in the language used in the Dark Web compared to that of the Surface Web.
As studies on the Dark Web commonly require textual analysis of the domain, language models specific to the Dark Web may provide valuable insights to researchers. In this work, we introduce DarkBERT, a language model pretrained on Dark Web data. We describe the steps taken to filter and compile the text data used to train DarkBERT to combat the extreme lexical and structural diversity of the Dark Web that may be detrimental to building a proper representation of the domain. We evaluate DarkBERT and its vanilla counterpart along with other widely used language models to validate the benefits that a Dark Web domain specific model offers in various use cases. Our evaluations show that DarkBERT outperforms current language models and may serve as a valuable resource for future research on the Dark Web.", + "authors": [ + "Youngjin Jin", + "Eugene Jang", + "Jian Cui", + "Jin-Woo Chung", + "Yongjae Lee", + "Seungwon Shin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.415", + "point2d": [ + 18.842586517333984, + 27.42002296447754 + ], + "cluster": 10.0 + }, + { + "idx": 417, + "title": "MDACE: MIMIC Documents Annotated with Code Evidence", + "abstract": "We introduce a dataset for evidence/rationale extraction on an extreme multi-label classification task over long medical documents. One such task is Computer-Assisted Coding (CAC) which has improved significantly in recent years, thanks to advances in machine learning technologies. Yet simply predicting a set of final codes for a patient encounter is insufficient as CAC systems are required to provide supporting textual evidence to justify the billing codes. A model able to produce accurate and reliable supporting evidence for each code would be a tremendous benefit. However, a human annotated code evidence corpus is extremely difficult to create because it requires specialized knowledge. In this paper, we introduce MDACE, the first publicly available code evidence dataset, which is built on a subset of the MIMIC-III clinical records. The dataset \u2013 annotated by professional medical coders \u2013 consists of 302 Inpatient charts with 3,934 evidence spans and 52 Profee charts with 5,563 evidence spans. We implemented several evidence extraction methods based on the EffectiveCAN model (Liu et al., 2021) to establish baseline performance on this dataset. MDACE can be used to evaluate code evidence extraction methods for CAC systems, as well as the accuracy and interpretability of deep learning models for multi-label classification. We believe that the release of MDACE will greatly improve the understanding and application of deep learning technologies for medical coding and document classification.", + "authors": [ + "Hua Cheng", + "Rana Jafari", + "April Russell", + "Russell Klopfer", + "Edmond Lu", + "Benjamin Striner", + "Matthew Gormley" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.416", + "point2d": [ + 28.59925079345703, + -41.00163269042969 + ], + "cluster": 42.0 + }, + { + "idx": 418, + "title": "Towards Zero-Shot Multilingual Transfer for Code-Switched Responses", + "abstract": "Recent task-oriented dialog systems have had great success in building English-based personal assistants, but extending these systems to a global audience is challenging due to the need for annotated data in the target language. 
An alternative approach is to leverage existing data in a high-resource language to enable cross-lingual transfer in low-resource language models. However, this type of transfer has not been widely explored in natural language response generation. In this research, we investigate the use of state-of-the-art multilingual models such as mBART and T5 to facilitate zero-shot and few-shot transfer of code-switched responses. We propose a new adapter-based framework that allows for efficient transfer by learning task-specific representations and encapsulating source and target language representations. Our framework is able to successfully transfer language knowledge even when the target language corpus is limited. We present both quantitative and qualitative analyses to evaluate the effectiveness of our approach.", + "authors": [ + "Ting-Wei Wu", + "Changsheng Zhao", + "Ernie Chang", + "Yangyang Shi", + "Pierce Chuang", + "Vikas Chandra", + "Biing Juang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.417", + "point2d": [ + 9.100503921508789, + 64.02862548828125 + ], + "cluster": 49.0 + }, + { + "idx": 419, + "title": "One Network, Many Masks: Towards More Parameter-Efficient Transfer Learning", + "abstract": "Fine-tuning pre-trained language models for multiple tasks can be expensive in terms of storage. Parameter-efficient transfer learning (PETL) methods have been proposed to address this issue, but they still require a significant number of parameters when being applied to broader ranges of tasks. To achieve even greater storage reduction, we propose ProPETL, a novel method that enables efficient sharing of a single prototype PETL network (e.g. adapter, LoRA, and prefix-tuning) across layers and tasks. We learn binary masks to select different sub-networks from the prototype network and apply them as PETL modules into different layers. We find that the binary masks can determine crucial structural information from the network, which is often ignored in previous studies. Our work can also be seen as a type of pruning method, where we find that overparameterization also exists in the seemingly small PETL modules. We evaluate ProPETL on various downstream tasks and show that it can outperform other PETL methods with around 10% of the parameters required by the latter.", + "authors": [ + "Guangtao Zeng", + "Peiyuan Zhang", + "Wei Lu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.418", + "point2d": [ + -37.91044998168945, + -15.163444519042969 + ], + "cluster": 8.0 + }, + { + "idx": 420, + "title": "Can Language Models Make Fun? A Case Study in Chinese Comical Crosstalk", + "abstract": "Language is the principal tool for human communication, in which humor is one of the most attractive parts. Producing natural language like humans using computers, a.k.a., Natural Language Generation (NLG), has been widely used for dialogue systems, chatbots, machine translation, as well as computer-aided creation, e.g., idea generation, scriptwriting. However, the humor aspect of natural language is relatively under-investigated, especially in the age of pre-trained language models. In this work, we aim to preliminarily test *whether NLG can generate humor as humans do*.
We build the largest dataset, consisting of numerous **C**hinese **C**omical **C**rosstalk scripts (called **C**3 in short), for a popular Chinese performing art called \u2018Xiangsheng\u2019 or \u2018\u76f8\u58f0\u2019 that dates back to the 1800s. We benchmark various generation approaches including training-from-scratch Seq2seq, fine-tuned middle-scale PLMs, and large-scale PLMs (with and without fine-tuning). Moreover, we also conduct a human assessment, showing that 1) *large-scale pretraining largely improves crosstalk generation quality*; and 2) *even the scripts generated from the best PLM are far from what we expect*. We conclude humor generation could be largely improved using large-scale PLMs, but it is still in its infancy. The data and benchmarking code are publicly available at [https://github.com/anonNo2/crosstalk-generation](https://github.com/anonNo2/crosstalk-generation).", + "authors": [ + "Jianquan Li", + "XiangBo Wu", + "Xiaokang Liu", + "Qianqian Xie", + "Prayag Tiwari", + "Benyou Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.419", + "point2d": [ + -25.693334579467773, + 39.05634689331055 + ], + "cluster": 35.0 + }, + { + "idx": 421, + "title": "Convergence and Diversity in the Control Hierarchy", + "abstract": "Weir has defined a hierarchy of language classes whose second member (L2) is generated by tree-adjoining grammars (TAG), linear indexed grammars (LIG), combinatory categorial grammars, and head grammars. The hierarchy is obtained using the mechanism of control, and L2 is obtained using a context-free grammar (CFG) whose derivations are controlled by another CFG. We adapt Weir\u2019s definition of a controllable CFG (called a labeled distinguished CFG) to give a definition of controllable pushdown automata (PDAs), called labeled distinguished PDAs. This yields three new characterizations of L2 as the class of languages generated by PDAs controlling PDAs, PDAs controlling CFGs, and CFGs controlling PDAs. We show that these four formalisms are not only weakly equivalent but equivalent in a stricter sense that we call d-weak equivalence. Furthermore, using an even stricter notion of equivalence called d-strong equivalence, we make precise the intuition that a CFG controlling a CFG is a TAG, a PDA controlling a PDA is an embedded PDA, and a PDA controlling a CFG is a LIG. The fourth member of this family, a CFG controlling a PDA, does not correspond to any kind of automaton we know of, so we invent one and call it a Pushdown Adjoining Automaton (PAA).", + "authors": [ + "Alexandra Butoi", + "Ryan Cotterell", + "David Chiang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.420", + "point2d": [ + -20.524555206298828, + -60.839107513427734 + ], + "cluster": 41.0 + }, + { + "idx": 422, + "title": "ConFEDE: Contrastive Feature Decomposition for Multimodal Sentiment Analysis", + "abstract": "Multimodal Sentiment Analysis aims to predict the sentiment of video content. Recent research suggests that multimodal sentiment analysis critically depends on learning a good representation of multimodal information, which should contain both modality-invariant representations that are consistent across modalities as well as modality-specific representations. In this paper, we propose ConFEDE, a unified learning framework that jointly performs contrastive representation learning and contrastive feature decomposition to enhance the representation of multimodal information.
It decomposes each of the three modalities of a video sample, including text, video frames, and audio, into a similarity feature and a dissimilarity feature, which are learned by a contrastive relation centered around the text. We conducted extensive experiments on CH-SIMS, MOSI and MOSEI to evaluate various state-of-the-art multimodal sentiment analysis methods. Experimental results show that ConFEDE outperforms all baselines on these datasets on a range of metrics.", + "authors": [ + "Jiuding Yang", + "Yakun Yu", + "Di Niu", + "Weidong Guo", + "Yu Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.421", + "point2d": [ + -40.3193244934082, + 58.36663055419922 + ], + "cluster": 23.0 + }, + { + "idx": 423, + "title": "Using Domain Knowledge to Guide Dialog Structure Induction via Neural Probabilistic Soft Logic", + "abstract": "Dialog Structure Induction (DSI) is the task of inferring the latent dialog structure (i.e., a set of dialog states and their temporal transitions) of a given goal-oriented dialog. It is a critical component for modern dialog system design and discourse analysis. Existing DSI approaches are often purely data-driven, deploy models that infer latent states without access to domain knowledge, underperform when the training corpus is limited/noisy, or have difficulty when test dialogs exhibit distributional shifts from the training domain. This work explores a neural-symbolic approach as a potential solution to these problems. We introduce Neural Probabilistic Soft Logic Dialogue Structure Induction (NEUPSL DSI), a principled approach that injects symbolic knowledge into the latent space of a generative neural model. We conduct a thorough empirical investigation on the effect of NEUPSL DSI learning on hidden representation quality, few-shot learning, and out-of-domain generalization performance. Over three dialog structure induction datasets and across unsupervised and semi-supervised settings for standard and cross-domain generalization, the injection of symbolic knowledge using NEUPSL DSI provides a consistent boost in performance over the canonical baselines.", + "authors": [ + "Connor Pryor", + "Quan Yuan", + "Jeremiah Liu", + "Mehran Kazemi", + "Deepak Ramachandran", + "Tania Bedrax-Weiss", + "Lise Getoor" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.422", + "point2d": [ + 2.901885747909546, + 68.83094024658203 + ], + "cluster": 24.0 + }, + { + "idx": 424, + "title": "Are You Copying My Model? Protecting the Copyright of Large Language Models for EaaS via Backdoor Watermark", + "abstract": "Large language models (LLMs) have demonstrated powerful capabilities in both text understanding and generation. Companies have begun to offer Embedding as a Service (EaaS) based on these LLMs, which can benefit various natural language processing (NLP) tasks for customers. However, previous studies have shown that EaaS is vulnerable to model extraction attacks, which can cause significant losses for the owners of LLMs, as training these models is extremely expensive. To protect the copyright of LLMs for EaaS, we propose an Embedding Watermark method called EmbMarker that implants backdoors on embeddings.
Our method selects a group of moderate-frequency words from a general text corpus to form a trigger set, then selects a target embedding as the watermark, and inserts it into the embeddings of texts containing trigger words as the backdoor. The weight of insertion is proportional to the number of trigger words included in the text. This allows the watermark backdoor to be effectively transferred to EaaS-stealer\u2019s model for copyright verification while minimizing the adverse impact on the original embeddings\u2019 utility. Our extensive experiments on various datasets show that our method can effectively protect the copyright of EaaS models without compromising service quality. Our code is available at https://github.com/yjw1029/EmbMarker.", + "authors": [ + "Wenjun Peng", + "Jingwei Yi", + "Fangzhao Wu", + "Shangxi Wu", + "Bin Bin Zhu", + "Lingjuan Lyu", + "Binxing Jiao", + "Tong Xu", + "Guangzhong Sun", + "Xing Xie" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.423", + "point2d": [ + -7.972565174102783, + 13.068628311157227 + ], + "cluster": 15.0 + }, + { + "idx": 425, + "title": "Answering Ambiguous Questions via Iterative Prompting", + "abstract": "In open-domain question answering, due to the ambiguity of questions, multiple plausible answers may exist. To provide feasible answers to an ambiguous question, one approach is to directly predict all valid answers, but this can struggle with balancing relevance and diversity. An alternative is to gather candidate answers and aggregate them, but this method can be computationally costly and may neglect dependencies among answers. In this paper, we present AmbigPrompt to address the imperfections of existing approaches to answering ambiguous questions. Specifically, we integrate an answering model with a prompting model in an iterative manner. The prompting model adaptively tracks the reading process and progressively triggers the answering model to compose distinct and relevant answers. Additionally, we develop a task-specific post-pretraining approach for both the answering model and the prompting model, which greatly improves the performance of our framework. Empirical studies on two commonly-used open benchmarks show that AmbigPrompt achieves state-of-the-art or competitive results while using less memory and having a lower inference latency than competing approaches. Additionally, AmbigPrompt also performs well in low-resource settings.", + "authors": [ + "Weiwei Sun", + "Hengyi Cai", + "Hongshen Chen", + "Pengjie Ren", + "Zhumin Chen", + "Maarten de Rijke", + "Zhaochun Ren" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.424", + "point2d": [ + 66.00701904296875, + 6.424683570861816 + ], + "cluster": 5.0 + }, + { + "idx": 426, + "title": "A Dataset of Argumentative Dialogues on Scientific Papers", + "abstract": "With recent advances in question-answering models, various datasets have been collected to improve and study the effectiveness of these models on scientific texts. Questions and answers in these datasets explore a scientific paper by seeking factual information from the paper\u2019s content. However, these datasets do not tackle the argumentative content of scientific papers, which is of huge importance to the persuasiveness of a scientific discussion. We introduce ArgSciChat, a dataset of 41 argumentative dialogues between scientists on 20 NLP papers.
The unique property of our dataset is that it includes both exploratory and argumentative questions and answers in a dialogue discourse on a scientific paper. Moreover, the size of ArgSciChat demonstrates the difficulties in collecting dialogues for specialized domains. Thus, our dataset is a challenging resource to evaluate dialogue agents in low-resource domains, in which collecting training data is costly. We annotate all sentences of dialogues in ArgSciChat and analyze them extensively. The results confirm that dialogues in ArgSciChat include exploratory and argumentative interactions. Furthermore, we use our dataset to fine-tune and evaluate a pre-trained document-grounded dialogue agent. The agent achieves a low performance on our dataset, motivating a need for dialogue agents with a capability to reason and argue about their answers. We publicly release ArgSciChat.", + "authors": [ + "Federico Ruggeri", + "Mohsen Mesgar", + "Iryna Gurevych" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.425", + "point2d": [ + 41.85460662841797, + 37.409481048583984 + ], + "cluster": 40.0 + }, + { + "idx": 427, + "title": "Massively Multilingual Lexical Specialization of Multilingual Transformers", + "abstract": "While pretrained language models (PLMs) primarily serve as general-purpose text encoders that can be fine-tuned for a wide variety of downstream tasks, recent work has shown that they can also be rewired to produce high-quality word representations (i.e., static word embeddings) and yield good performance in type-level lexical tasks. While existing work primarily focused on the lexical specialization of monolingual PLMs with immense quantities of monolingual constraints, in this work we expose massively multilingual transformers (MMTs, e.g., mBERT or XLM-R) to multilingual lexical knowledge at scale, leveraging BabelNet as the readily available rich source of multilingual and cross-lingual type-level lexical knowledge. Concretely, we use BabelNet\u2019s multilingual synsets to create synonym pairs (or synonym-gloss pairs) across 50 languages and then subject the MMTs (mBERT and XLM-R) to a lexical specialization procedure guided by a contrastive objective. We show that such massively multilingual lexical specialization brings substantial gains in two standard cross-lingual lexical tasks, bilingual lexicon induction and cross-lingual word similarity, as well as in cross-lingual sentence retrieval. Crucially, we observe gains for languages unseen in specialization, indicating that multilingual lexical specialization enables generalization to languages with no lexical constraints.
In a series of subsequent controlled experiments, we show that the number of specialization constraints plays a much greater role than the set of languages from which they originate.", + "authors": [ + "Tommaso Green", + "Simone Paolo Ponzetto", + "Goran Glava\u0161" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.426", + "point2d": [ + -33.09092712402344, + -27.383228302001953 + ], + "cluster": 20.0 + }, + { + "idx": 428, + "title": "RL4F: Generating Natural Language Feedback with Reinforcement Learning for Repairing Model Outputs", + "abstract": "Despite their unprecedented success, even the largest language models make mistakes. Similar to how humans learn and improve using feedback, previous work proposed providing language models with natural language feedback to guide them in repairing their outputs. Because human-generated critiques are expensive to obtain, researchers have devised learned critique generators in lieu of human critics while assuming one can train downstream models to utilize generated feedback. However, this approach does not apply to black-box or limited access models such as ChatGPT, as they cannot be fine-tuned. Moreover, in the era of large general-purpose language agents, fine-tuning is neither computationally nor spatially efficient as it results in multiple copies of the network. In this work, we introduce RL4F (Reinforcement Learning for Feedback), a multi-agent collaborative framework where the critique generator is trained to maximize end-task performance of GPT-3, a fixed model more than 200 times its size. RL4F produces critiques that help GPT-3 revise its outputs. We study three datasets for action planning, summarization and alphabetization and show relative improvements up to 10% in multiple text similarity metrics over other learned, retrieval-augmented or prompting-based critique generators.", + "authors": [ + "Afra Feyza Akyurek", + "Ekin Akyurek", + "Ashwin Kalyan", + "Peter Clark", + "Derry Tanti Wijaya", + "Niket Tandon" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.427", + "point2d": [ + -2.2741997241973877, + 46.79343795776367 + ], + "cluster": 2.0 + }, + { + "idx": 429, + "title": "WebIE: Faithful and Robust Information Extraction on the Web", + "abstract": "Extracting structured and grounded fact triples from raw text is a fundamental task in Information Extraction (IE). Existing IE datasets are typically collected from Wikipedia articles, using hyperlinks to link entities to the Wikidata knowledge base. However, models trained only on Wikipedia have limitations when applied to web domains, which often contain noisy text or text that does not have any factual information. We present WebIE, the first large-scale, entity-linked closed IE dataset consisting of 1.6M sentences automatically collected from the English Common Crawl corpus. WebIE also includes negative examples, i.e. sentences without fact triples, to better reflect the data on the web. We annotate ~25K triples from WebIE through crowdsourcing and introduce mWebIE, a translation of the annotated set in four other languages: French, Spanish, Portuguese, and Hindi. We evaluate the in-domain, out-of-domain, and zero-shot cross-lingual performance of generative IE models and find models trained on WebIE show better generalisability. We also propose three training strategies that use entity linking as an auxiliary task.
Our experiments show that adding Entity-Linking objectives improves the faithfulness of our generative IE models.", + "authors": [ + "Chenxi Whitehouse", + "Clara Vania", + "Alham Fikri Aji", + "Christos Christodoulopoulos", + "Andrea Pierleoni" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.428", + "point2d": [ + 46.789710998535156, + -64.02669525146484 + ], + "cluster": 25.0 + }, + { + "idx": 430, + "title": "NormBank: A Knowledge Bank of Situational Social Norms", + "abstract": "We present NormBank, a knowledge bank of 155k situational norms. This resource is designed to ground flexible normative reasoning for interactive, assistive, and collaborative AI systems. Unlike prior commonsense resources, NormBank grounds each inference within a multivalent sociocultural frame, which includes the setting (e.g., restaurant), the agents\u2019 contingent roles (waiter, customer), their attributes (age, gender), and other physical, social, and cultural constraints (e.g., the temperature or the country of operation). In total, NormBank contains 63k unique constraints from a taxonomy that we introduce and iteratively refine here. Constraints then apply in different combinations to frame social norms. Under these manipulations, norms are non-monotonic \u2014 one can cancel an inference by updating its frame even slightly. Still, we find evidence that neural models can help reliably extend the scope and coverage of NormBank. We further demonstrate the utility of this resource with a series of transfer experiments. For data and code, see https://github.com/SALT-NLP/normbank", + "authors": [ + "Caleb Ziems", + "Jane Dwivedi-Yu", + "Yi-Chia Wang", + "Alon Halevy", + "Diyi Yang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.429", + "point2d": [ + 48.512939453125, + -7.538443088531494 + ], + "cluster": 31.0 + }, + { + "idx": 431, + "title": "DIP: Dead code Insertion based Black-box Attack for Programming Language Model", + "abstract": "Automatic processing of source code, such as code clone detection and software vulnerability detection, is very helpful to software engineers. Large pre-trained Programming Language (PL) models (such as CodeBERT, GraphCodeBERT, CodeT5, etc.) show very powerful performance on these tasks. However, these PL models are vulnerable to adversarial examples that are generated with slight perturbation. Unlike natural language, an adversarial example of code must be semantic-preserving and compilable. Due to the requirements, it is hard to directly apply the existing attack methods for natural language models. In this paper, we propose DIP (Dead code Insertion based Black-box Attack for Programming Language Model), a high-performance and effective black-box attack method to generate adversarial examples using dead code insertion. We evaluate our proposed method on 9 victim downstream-task large code models.
Our method outperforms the state-of-the-art black-box attack in both attack efficiency and attack quality, while the generated adversarial examples remain compilable and semantic-preserving.", + "authors": [ + "CheolWon Na", + "YunSeok Choi", + "Jee-Hyong Lee" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.430", + "point2d": [ + -6.184549331665039, + -58.462074279785156 + ], + "cluster": 15.0 + }, + { + "idx": 432, + "title": "Modeling Structural Similarities between Documents for Coherence Assessment with Graph Convolutional Networks", + "abstract": "Coherence is an important aspect of text quality, and various approaches have been applied to coherence modeling. However, existing methods solely focus on a single document\u2019s coherence patterns, ignoring the underlying correlation between documents. We investigate a GCN-based coherence model that is capable of capturing structural similarities between documents. Our model first creates a graph structure for each document, from where we mine different subgraph patterns. We then construct a heterogeneous graph for the training corpus, connecting documents based on their shared subgraphs. Finally, a GCN is applied to the heterogeneous graph to model the connectivity relationships. We evaluate our method on two tasks, assessing discourse coherence and automated essay scoring. Results show that our GCN-based model outperforms all baselines, achieving a new state-of-the-art on both tasks.", + "authors": [ + "Wei Liu", + "Xiyan Fu", + "Michael Strube" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.431", + "point2d": [ + -1.374474287033081, + 42.61122512817383 + ], + "cluster": 47.0 + }, + { + "idx": 433, + "title": "HiTIN: Hierarchy-aware Tree Isomorphism Network for Hierarchical Text Classification", + "abstract": "Hierarchical text classification (HTC) is a challenging subtask of multi-label classification as the labels form a complex hierarchical structure. Existing dual-encoder methods in HTC achieve weak performance gains with huge memory overheads and their structure encoders heavily rely on domain knowledge. Under this observation, we investigate the feasibility of a memory-friendly model with strong generalization capability that could boost the performance of HTC without prior statistics or label semantics. In this paper, we propose Hierarchy-aware Tree Isomorphism Network (HiTIN) to enhance the text representations with only syntactic information of the label hierarchy. Specifically, we convert the label hierarchy into an unweighted tree structure, termed coding tree, with the guidance of structural entropy. Then we design a structure encoder to incorporate hierarchy-aware information in the coding tree into text representations. Besides the text encoder, HiTIN only contains a few multi-layer perceptrons and linear transformations, which greatly saves memory.
We conduct experiments on three commonly used datasets and the results demonstrate that HiTIN could achieve better test performance and less memory consumption than state-of-the-art (SOTA) methods.", + "authors": [ + "He Zhu", + "Chong Zhang", + "Junjie Huang", + "Junran Wu", + "Ke Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.432", + "point2d": [ + 1.5739444494247437, + -23.18395233154297 + ], + "cluster": 17.0 + }, + { + "idx": 434, + "title": "Contextual Knowledge Learning for Dialogue Generation", + "abstract": "Incorporating conversational context and knowledge into dialogue generation models has been essential for improving the quality of the generated responses. The context, comprising utterances from previous dialogue exchanges, is used as a source of content for response generation and as a means of selecting external knowledge. However, to avoid introducing irrelevant content, it is key to enable fine-grained scoring of context and knowledge. In this paper, we present a novel approach to context and knowledge weighting as an integral part of model training. We guide the model training through a Contextual Knowledge Learning (CKL) process which involves Latent Vectors for context and knowledge, respectively. CKL Latent Vectors capture the relationship between context, knowledge, and responses through weak supervision and enable differential weighting of context utterances and knowledge sentences during the training process. Experiments with two standard datasets and human evaluation demonstrate that CKL leads to a significant improvement compared with the performance of six strong baseline models and shows robustness with regard to reduced sizes of training sets.", + "authors": [ + "Wen Zheng", + "Natasa Milic-Frayling", + "Ke Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.433", + "point2d": [ + 14.378194808959961, + 59.11611557006836 + ], + "cluster": 24.0 + }, + { + "idx": 435, + "title": "Easy Guided Decoding in Providing Suggestions for Interactive Machine Translation", + "abstract": "Machine translation technology has made great progress in recent years, but it cannot guarantee error-free results. Human translators perform post-editing on machine translations to correct errors in the setting of computer-aided translation. In favor of expediting the post-editing process, many works have investigated machine translation in interactive modes, in which machines can automatically refine the rest of translations constrained by human\u2019s edits. Translation Suggestion (TS), as an interactive mode to assist human translators, requires machines to generate alternatives for specific incorrect words or phrases selected by human translators. In this paper, we utilize the parameterized objective function of neural machine translation (NMT) and propose a novel constrained decoding algorithm, namely Prefix-Suffix Guided Decoding (PSGD), to deal with the TS problem without additional training. Compared to state-of-the-art lexical-constrained decoding method, PSGD improves translation quality by an average of 10.6 BLEU and reduces time overhead by an average of 63.4% on benchmark datasets.
Furthermore, on both the WeTS and the WMT 2022 Translation Suggestion datasets, it is superior to other supervised learning systems trained with TS annotated data.", + "authors": [ + "Ke Wang", + "Xin Ge", + "Jiayi Wang", + "Yuqi Zhang", + "Yu Zhao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.434", + "point2d": [ + -62.30316925048828, + -4.834588050842285 + ], + "cluster": 1.0 + }, + { + "idx": 436, + "title": "Discourse-Centric Evaluation of Document-level Machine Translation with a New Densely Annotated Parallel Corpus of Novels", + "abstract": "Several recent papers claim to have achieved human parity at sentence-level machine translation (MT)\u2014especially between high-resource language pairs. In response, the MT community has, in part, shifted its focus to document-level translation. Translating documents requires a deeper understanding of the structure and meaning of text, which is often captured by various kinds of discourse phenomena such as consistency, coherence, and cohesion. However, this renders conventional sentence-level MT evaluation benchmarks inadequate for evaluating the performance of context-aware MT systems. This paper presents a new dataset with rich discourse annotations, built upon the large-scale parallel corpus BWB introduced in Jiang et al. (2022a). The new BWB annotation introduces four extra evaluation aspects, i.e., entity, terminology, coreference, and quotation, covering 15,095 entity mentions in both languages. Using these annotations, we systematically investigate the similarities and differences between the discourse structures of source and target languages, and the challenges they pose to MT. We discover that MT outputs differ fundamentally from human translations in terms of their latent discourse structures. This gives us a new perspective on the challenges and opportunities in document-level MT. We make our resource publicly available to spur future research in document-level MT and its generalization to other language translation tasks.", + "authors": [ + "Yuchen Eleanor Jiang", + "Tianyu Liu", + "Shuming Ma", + "Dongdong Zhang", + "Mrinmaya Sachan", + "Ryan Cotterell" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.435", + "point2d": [ + -67.96273040771484, + 3.001760482788086 + ], + "cluster": 1.0 + }, + { + "idx": 437, + "title": "CMOT: Cross-modal Mixup via Optimal Transport for Speech Translation", + "abstract": "End-to-end speech translation (ST) is the task of translating speech signals in the source language into text in the target language. As a cross-modal task, end-to-end ST is difficult to train with limited data. Existing methods often try to transfer knowledge from machine translation (MT), but their performances are restricted by the modality gap between speech and text. In this paper, we propose Cross-modal Mixup via Optimal Transport (CMOT) to overcome the modality gap. We find the alignment between speech and text sequences via optimal transport and then mix up the sequences from different modalities at a token level using the alignment. Experiments on the MuST-C ST benchmark demonstrate that CMOT achieves an average BLEU of 30.0 in 8 translation directions, outperforming previous methods.
Further analysis shows CMOT can adaptively find the alignment between modalities, which helps alleviate the modality gap between speech and text.", + "authors": [ + "Yan Zhou", + "Qingkai Fang", + "Yang Feng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.436", + "point2d": [ + -68.53536987304688, + 22.010282516479492 + ], + "cluster": 37.0 + }, + { + "idx": 438, + "title": "On the Evaluation of Neural Selective Prediction Methods for Natural Language Processing", + "abstract": "We provide a survey and empirical comparison of the state-of-the-art in neural selective classification for NLP tasks. We also provide a methodological blueprint, including a novel metric called refinement that provides a calibrated evaluation of confidence functions for selective prediction. Finally, we supply documented, open-source code to support the future development of selective prediction techniques.", + "authors": [ + "Zhengyao Gu", + "Mark Hopkins" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.437", + "point2d": [ + -38.64419174194336, + -34.62229537963867 + ], + "cluster": 6.0 + }, + { + "idx": 439, + "title": "Speech-Text Pre-training for Spoken Dialog Understanding with Explicit Cross-Modal Alignment", + "abstract": "Recently, speech-text pre-training methods have shown remarkable success in many speech and natural language processing tasks. However, most previous pre-trained models are usually tailored for one or two specific tasks, but fail to conquer a wide range of speech-text tasks. In addition, existing speech-text pre-training methods fail to explore the contextual information within a dialogue to enrich utterance representations. In this paper, we propose Speech-text Pre-training for spoken dialog understanding with ExpliCiT cRoss-Modal Alignment (SPECTRA), which is the first-ever speech-text dialog pre-training model. Concretely, to consider the temporality of speech modality, we design a novel temporal position prediction task to capture the speech-text alignment. This pre-training task aims to predict the start and end time of each textual word in the corresponding speech waveform. In addition, to learn the characteristics of spoken dialogs, we generalize a response selection task from textual dialog pre-training to speech-text dialog pre-training scenarios. Experimental results on four different downstream speech-text tasks demonstrate the superiority of SPECTRA in learning speech-text alignment and multi-turn dialog context.", + "authors": [ + "Tianshu Yu", + "Haoyu Gao", + "Ting-En Lin", + "Min Yang", + "Yuchuan Wu", + "Wentao Ma", + "Chao Wang", + "Fei Huang", + "Yongbin Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.438", + "point2d": [ + -0.6868422627449036, + 65.2951889038086 + ], + "cluster": 49.0 + }, + { + "idx": 440, + "title": "Text Style Transfer with Contrastive Transfer Pattern Mining", + "abstract": "Text style transfer (TST) is an important task in natural language generation, which aims to alter the stylistic attributes (e.g., sentiment) of a sentence and keep its semantic meaning unchanged. Most existing studies mainly focus on the transformation between styles, yet ignore that this transformation can be actually carried out via different hidden transfer patterns. 
To address this problem, we propose a novel approach, contrastive transfer pattern mining (CTPM), which automatically mines and utilizes inherent latent transfer patterns to improve the performance of TST. Specifically, we design an adaptive clustering module to automatically discover hidden transfer patterns from the data, and introduce contrastive learning based on the discovered patterns to obtain more accurate sentence representations, and thereby benefit the TST task. To the best of our knowledge, this is the first work that proposes the concept of transfer patterns in TST, and our approach can be applied in a plug-and-play manner to enhance other TST methods to further improve their performance. Extensive experiments on benchmark datasets verify the effectiveness and generality of our approach.", + "authors": [ + "Jingxuan Han", + "Quan Wang", + "Licheng Zhang", + "Weidong Chen", + "Yan Song", + "Zhendong Mao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.439", + "point2d": [ + -22.077861785888672, + 43.27910614013672 + ], + "cluster": 35.0 + }, + { + "idx": 441, + "title": "Zero- and Few-Shot Event Detection via Prompt-Based Meta Learning", + "abstract": "With emerging online topics as a source for numerous new events, detecting unseen / rare event types presents an elusive challenge for existing event detection methods, where only limited data access is provided for training. To address the data scarcity problem in event detection, we propose MetaEvent, a meta learning-based framework for zero- and few-shot event detection. Specifically, we sample training tasks from existing event types and perform meta training to search for optimal parameters that quickly adapt to unseen tasks. In our framework, we propose to use the cloze-based prompt and a trigger-aware soft verbalizer to efficiently project output to unseen event types. Moreover, we design a contrastive meta objective based on maximum mean discrepancy (MMD) to learn class-separating features. As such, the proposed MetaEvent can perform zero-shot event detection by mapping features to event types without any prior knowledge. In our experiments, we demonstrate the effectiveness of MetaEvent in both zero-shot and few-shot scenarios, where the proposed method achieves state-of-the-art performance in extensive experiments on benchmark datasets FewEvent and MAVEN.", + "authors": [ + "Zhenrui Yue", + "Huimin Zeng", + "Mengfei Lan", + "Heng Ji", + "Dong Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.440", + "point2d": [ + 51.989139556884766, + -36.79521560668945 + ], + "cluster": 32.0 + }, + { + "idx": 442, + "title": "Text Style Transfer Back-Translation", + "abstract": "Back Translation (BT) is widely used in the field of machine translation, as it has been proved effective for enhancing translation quality. However, BT mainly improves the translation of inputs that share a similar style (to be more specific, translation-liked inputs), since the source side of BT data is machine-translated. For natural inputs, BT brings only slight improvements and sometimes even adverse effects. To address this issue, we propose Text Style Transfer Back Translation (TST BT), which uses a style transfer to modify the source side of BT data. By making the style of source-side text more natural, we aim to improve the translation of natural inputs. 
Our experiments on various language pairs, including both high-resource and low-resource ones, demonstrate that TST BT significantly improves translation performance against popular BT benchmarks. In addition, TST BT is shown to be effective in domain adaptation, so this strategy can be regarded as a generalized data augmentation method. Our training code and text style transfer model are open-sourced.", + "authors": [ + "Daimeng Wei", + "Zhanglin Wu", + "Hengchao Shang", + "Zongyao Li", + "Minghan Wang", + "Jiaxin Guo", + "Xiaoyu Chen", + "Zhengzhe Yu", + "Hao Yang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.441", + "point2d": [ + -66.22592163085938, + -9.08630084991455 + ], + "cluster": 1.0 + }, + { + "idx": 443, + "title": "Generating Visual Spatial Description via Holistic 3D Scene Understanding", + "abstract": "Visual spatial description (VSD) aims to generate texts that describe the spatial relations of the given objects within images. Existing VSD work merely models the 2D geometrical vision features, thus inevitably falling prey to the problem of skewed spatial understanding of target objects. In this work, we investigate the incorporation of 3D scene features for VSD. With an external 3D scene extractor, we obtain the 3D objects and scene features for input images, based on which we construct a target object-centered 3D spatial scene graph (Go3D-S2G), such that we model the spatial semantics of target objects within the holistic 3D scenes. Besides, we propose a scene subgraph selecting mechanism, sampling topologically-diverse subgraphs from Go3D-S2G, where the diverse local structure features are navigated to yield spatially-diversified text generation. Experimental results on two VSD datasets demonstrate that our framework outperforms the baselines significantly, especially improving on the cases with complex visual spatial relations. Meanwhile, our method can produce more spatially-diversified generation.", + "authors": [ + "Yu Zhao", + "Hao Fei", + "Wei Ji", + "Jianguo Wei", + "Meishan Zhang", + "Min Zhang", + "Tat-Seng Chua" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.442", + "point2d": [ + -52.92557907104492, + 46.575443267822266 + ], + "cluster": 43.0 + }, + { + "idx": 444, + "title": "Continual Knowledge Distillation for Neural Machine Translation", + "abstract": "While many parallel corpora are not publicly accessible for data copyright, data privacy and competitive differentiation reasons, trained translation models are increasingly available on open platforms. In this work, we propose a method called continual knowledge distillation to take advantage of existing translation models to improve one model of interest. The basic idea is to sequentially transfer knowledge from each trained model to the distilled model. 
Extensive experiments on Chinese-English and German-English datasets show that our method achieves significant and consistent improvements over strong baselines under both homogeneous and heterogeneous trained model settings and is robust to malicious models.", + "authors": [ + "Yuanchi Zhang", + "Peng Li", + "Maosong Sun", + "Yang Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.443", + "point2d": [ + -62.95326614379883, + -12.22775936126709 + ], + "cluster": 21.0 + }, + { + "idx": 445, + "title": "Query Refinement Prompts for Closed-Book Long-Form QA", + "abstract": "Large language models (LLMs) have been shown to perform well in answering questions and in producing long-form texts, both in few-shot closed-book settings. While the former can be validated using well-known evaluation metrics, the latter is difficult to evaluate. We resolve the difficulty of evaluating long-form output by doing both tasks at once \u2013 to do question answering that requires long-form answers. Such questions tend to be multifaceted, i.e., they may have ambiguities and/or require information from multiple sources. To this end, we define query refinement prompts that encourage LLMs to explicitly express the multifacetedness in questions and generate long-form answers covering multiple facets of the question. Our experiments on two long-form question answering datasets, ASQA and AQuAMuSe, show that using our prompts allows us to outperform fully finetuned models in the closed book setting, as well as achieve results comparable to retrieve-then-generate open-book models.", + "authors": [ + "Reinald Kim Amplayo", + "Kellie Webster", + "Michael Collins", + "Dipanjan Das", + "Shashi Narayan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.444", + "point2d": [ + 65.39994812011719, + 10.351058959960938 + ], + "cluster": 5.0 + }, + { + "idx": 446, + "title": "CONE: An Efficient COarse-to-fiNE Alignment Framework for Long Video Temporal Grounding", + "abstract": "This paper tackles an emerging and challenging problem of long video temporal grounding (VTG) that localizes video moments related to a natural language (NL) query. Compared with short videos, long videos are also in high demand but less explored, which brings new challenges in higher inference computation cost and weaker multi-modal alignment. To address these challenges, we propose CONE, an efficient COarse-to-fiNE alignment framework. CONE is a plug-and-play framework on top of existing VTG models to handle long videos through a sliding window mechanism. Specifically, CONE (1) introduces a query-guided window selection strategy to speed up inference, and (2) proposes a coarse-to-fine mechanism via a novel incorporation of contrastive learning to enhance multi-modal alignment for long videos. Extensive experiments on two large-scale long VTG benchmarks consistently show both substantial performance gains (e.g., from 3.13 to 6.87% on MAD) and state-of-the-art results. Analyses also reveal higher efficiency as the query-guided window selection mechanism accelerates inference time by 2x on Ego4D-NLQ and 15x on MAD while keeping SOTA results. Codes have been released at https://github.com/houzhijian/CONE.", + "authors": [ + "Zhijian Hou", + "Wanjun Zhong", + "Lei Ji", + "Difei Gao", + "Kun Yan", + "W.k. 
Chan", + "Chong-Wah Ngo", + "Mike Zheng Shou", + "Nan Duan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.445", + "point2d": [ + -60.213584899902344, + 59.703407287597656 + ], + "cluster": 26.0 + }, + { + "idx": 447, + "title": "Few-Shot Document-Level Event Argument Extraction", + "abstract": "Event argument extraction (EAE) has been well studied at the sentence level but under-explored at the document level. In this paper, we study to capture event arguments that actually spread across sentences in documents. Prior works usually assume full access to rich document supervision, ignoring the fact that the available argument annotation is limited in production.To fill this gap, we present FewDocAE, a Few-Shot Document-Level Event Argument Extraction benchmark, based on the existing document-level event extraction dataset. We first define the new problem and reconstruct the corpus by a novel N-Way-D-Doc sampling instead of the traditional N-Way-K-Shot strategy. Then we adjust the current document-level neural models into the few-shot setting to provide baseline results under in- and cross-domain settings. Since the argument extraction depends on the context from multiple sentences and the learning process is limited to very few examples, we find this novel task to be very challenging with substantively low performance. Considering FewDocAE is closely related to practical use under low-resource regimes, we hope this benchmark encourages more research in this direction. Our data and codes will be available online.", + "authors": [ + "Xianjun Yang", + "Yujie Lu", + "Linda Petzold" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.446", + "point2d": [ + 44.78990173339844, + -43.48051071166992 + ], + "cluster": 28.0 + }, + { + "idx": 448, + "title": "ParaAMR: A Large-Scale Syntactically Diverse Paraphrase Dataset by AMR Back-Translation", + "abstract": "Paraphrase generation is a long-standing task in natural language processing (NLP). Supervised paraphrase generation models, which rely on human-annotated paraphrase pairs, are cost-inefficient and hard to scale up. On the other hand, automatically annotated paraphrase pairs (e.g., by machine back-translation), usually suffer from the lack of syntactic diversity \u2013 the generated paraphrase sentences are very similar to the source sentences in terms of syntax. In this work, we present ParaAMR, a large-scale syntactically diverse paraphrase dataset created by abstract meaning representation back-translation. Our quantitative analysis, qualitative examples, and human evaluation demonstrate that the paraphrases of ParaAMR are syntactically more diverse compared to existing large-scale paraphrase datasets while preserving good semantic similarity. In addition, we show that ParaAMR can be used to improve on three NLP tasks: learning sentence embeddings, syntactically controlled paraphrase generation, and data augmentation for few-shot learning. 
Our results thus showcase the potential of ParaAMR for improving various NLP applications.", + "authors": [ + "Kuan-Hao Huang", + "Varun Iyer", + "I-Hung Hsu", + "Anoop Kumar", + "Kai-Wei Chang", + "Aram Galstyan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.447", + "point2d": [ + -10.885784149169922, + 4.9826130867004395 + ], + "cluster": 35.0 + }, + { + "idx": 449, + "title": "Towards Understanding and Improving Knowledge Distillation for Neural Machine Translation", + "abstract": "Knowledge distillation (KD) is a promising technique for model compression in neural machine translation. However, where the knowledge hides in KD is still not clear, which may hinder the development of KD. In this work, we first unravel this mystery from an empirical perspective and show that the knowledge comes from the top-1 predictions of teachers, which also helps us build a potential connection between word- and sequence-level KD. Further, we point out two inherent issues in vanilla word-level KD based on this finding. Firstly, the current objective of KD spreads its focus to whole distributions to learn the knowledge, yet lacks special treatment on the most crucial top-1 information. Secondly, the knowledge is largely covered by the golden information due to the fact that most top-1 predictions of teachers overlap with ground-truth tokens, which further restricts the potential of KD. To address these issues, we propose a new method named Top-1 Information Enhanced Knowledge Distillation (TIE-KD). Specifically, we design a hierarchical ranking loss to enforce the learning of the top-1 information from the teacher. Additionally, we develop an iterative KD procedure to infuse more additional knowledge by distilling on the data without ground-truth targets. Experiments on WMT\u201914 English-German, WMT\u201914 English-French and WMT\u201916 English-Romanian demonstrate that our method can respectively boost Transformer_{base} students by +1.04, +0.60 and +1.11 BLEU scores and significantly outperforms the vanilla word-level KD baseline. Besides, our method shows higher generalizability on different teacher-student capacity gaps than existing KD techniques.", + "authors": [ + "Songming Zhang", + "Yunlong Liang", + "Shuaibo Wang", + "Yufeng Chen", + "Wenjuan Han", + "Jian Liu", + "Jinan Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.448", + "point2d": [ + -50.73259353637695, + -19.970317840576172 + ], + "cluster": 39.0 + }, + { + "idx": 450, + "title": "Multi-Row, Multi-Span Distant Supervision For Table+Text Question Answering", + "abstract": "Question answering (QA) over tables and linked text, also called TextTableQA, has witnessed significant research in recent years, as tables are often found embedded in documents along with related text. HybridQA and OTT-QA are the two best-known TextTableQA datasets, with questions that are best answered by combining information from both table cells and linked text passages. A common challenge in both datasets, and TextTableQA in general, is that the training instances include just the question and answer, where the gold answer may match not only multiple table cells across table rows but also multiple text spans within the scope of a table row and its associated text. This leads to a noisy multi-instance training regime. 
We present MITQA, a transformer-based TextTableQA system that is explicitly designed to cope with distant supervision along both these axes, through a multi-instance loss objective, together with careful curriculum design. Our experiments show that the proposed multi-instance distant supervision approach helps MITQA get state-of-the-art results, beating the existing baselines for both HybridQA and OTT-QA, putting MITQA at the top of the HybridQA leaderboard with the best EM and F1 scores on a held-out test set.", + "authors": [ + "Vishwajeet Kumar", + "Yash Gupta", + "Saneem Chemmengath", + "Jaydeep Sen", + "Soumen Chakrabarti", + "Samarth Bharadwaj", + "Feifei Pan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.449", + "point2d": [ + 76.91914367675781, + 8.847046852111816 + ], + "cluster": 5.0 + }, + { + "idx": 451, + "title": "HAHE: Hierarchical Attention for Hyper-Relational Knowledge Graphs in Global and Local Level", + "abstract": "Link Prediction on Hyper-relational Knowledge Graphs (HKG) is a worthwhile endeavor. HKG consists of hyper-relational facts (H-Facts), composed of a main triple and several auxiliary attribute-value qualifiers, which can effectively represent factually comprehensive information. The internal structure of HKG can be represented as a hypergraph-based representation globally and a semantic sequence-based representation locally. However, existing research seldom simultaneously models the graphical and sequential structure of HKGs, limiting HKGs\u2019 representation. To overcome this limitation, we propose a novel Hierarchical Attention model for HKG Embedding (HAHE), including global-level and local-level attention. The global-level attention can model the graphical structure of HKG using hypergraph dual-attention layers, while the local-level attention can learn the sequential structure inside H-Facts via heterogeneous self-attention layers. Experiment results indicate that HAHE achieves state-of-the-art performance in link prediction tasks on HKG standard datasets. In addition, HAHE addresses the issue of HKG multi-position prediction for the first time, increasing the applicability of the HKG link prediction task. Our code is publicly available.", + "authors": [ + "Haoran Luo", + "Haihong E", + "Yuhao Yang", + "Yikai Guo", + "Mingzhi Sun", + "Tianyu Yao", + "Zichen Tang", + "Kaiyang Wan", + "Meina Song", + "Wei Lin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.450", + "point2d": [ + 55.214378356933594, + -63.21278381347656 + ], + "cluster": 45.0 + }, + { + "idx": 452, + "title": "ORGAN: Observation-Guided Radiology Report Generation via Tree Reasoning", + "abstract": "This paper explores the task of radiology report generation, which aims at generating free-text descriptions for a set of radiographs. One significant challenge of this task is how to correctly maintain the consistency between the images and the lengthy report. Previous research explored solving this issue through planning-based methods, which generate reports only based on high-level plans. However, these plans usually only contain the major observations from the radiographs (e.g., lung opacity), lacking much necessary information, such as the observation characteristics and preliminary clinical diagnoses. To address this problem, the system should also take the image information into account together with the textual plan and perform stronger reasoning during the generation process. 
In this paper, we propose an Observation-guided radiology Report Generation framework (ORGan). It first produces an observation plan and then feeds both the plan and the radiographs into report generation, where an observation graph and a tree reasoning mechanism are adopted to precisely enrich the plan information by capturing the multiple formats of each observation. Experimental results demonstrate that our framework outperforms previous state-of-the-art methods regarding text quality and clinical efficacy.", + "authors": [ + "Wenjun Hou", + "Kaishuai Xu", + "Yi Cheng", + "Wenjie Li", + "Jiang Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.451", + "point2d": [ + 34.114933013916016, + -33.01219940185547 + ], + "cluster": 42.0 + }, + { + "idx": 453, + "title": "Data Curation Alone Can Stabilize In-context Learning", + "abstract": "In-context learning (ICL) enables large language models (LLMs) to perform new tasks by prompting them with a sequence of training examples. However, it is known that ICL is very sensitive to the choice of training examples: randomly sampling examples from a training set leads to high variance in performance. In this paper, we show that carefully curating a subset of training data greatly stabilizes ICL performance without any other changes to the ICL algorithm (e.g., prompt retrieval or calibration). We introduce two methods to choose training subsets\u2014both score training examples individually, then select the highest-scoring ones. CondAcc scores a training example by its average dev-set ICL accuracy when combined with random training examples, while Datamodels learns linear regressors that estimate how the presence of each training example influences LLM outputs. Across five tasks and two LLMs, sampling from stable subsets selected by CondAcc and Datamodels improves average accuracy over sampling from the entire training set by 7.7% and 6.3%, respectively. Surprisingly, the stable subset examples are not especially diverse in content or low in perplexity, in contrast with other work suggesting that diversity and perplexity are important when prompting LLMs.", + "authors": [ + "Ting-Yun Chang", + "Robin Jia" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.452", + "point2d": [ + -13.495237350463867, + -21.658510208129883 + ], + "cluster": 3.0 + }, + { + "idx": 454, + "title": "MidMed: Towards Mixed-Type Dialogues for Medical Consultation", + "abstract": "Most medical dialogue systems assume that patients have clear goals (seeking a diagnosis, medicine querying, etc.) before medical consultation. However, in many real situations, due to the lack of medical knowledge, it is usually difficult for patients to determine clear goals with all necessary slots. In this paper, we identify this challenge as how to construct medical consultation dialogue systems to help patients clarify their goals. For further study, we create a novel human-to-human mixed-type medical consultation dialogue corpus, termed MidMed, covering four dialogue types: task-oriented dialogue for diagnosis, recommendation, QA, and chitchat. MidMed covers four departments (otorhinolaryngology, ophthalmology, skin, and digestive system), with 8,309 dialogues. Furthermore, we build benchmarking baselines on MidMed and propose an instruction-guiding medical dialogue generation framework, termed InsMed, to handle mixed-type dialogues. 
Experimental results show the effectiveness of InsMed.", + "authors": [ + "Xiaoming Shi", + "Zeming Liu", + "Chuan Wang", + "Haitao Leng", + "Kui Xue", + "Xiaofan Zhang", + "Shaoting Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.453", + "point2d": [ + 24.502460479736328, + 69.49446868896484 + ], + "cluster": 24.0 + }, + { + "idx": 455, + "title": "FiD-ICL: A Fusion-in-Decoder Approach for Efficient In-Context Learning", + "abstract": "Large pre-trained models are capable of few-shot in-context learning (ICL), i.e., performing a new task by prepending a few demonstrations before the test input. However, the concatenated demonstrations are often excessively long and induce additional computation. Inspired by fusion-in-decoder (FiD) models which efficiently aggregate more passages and thus outperform concatenation-based models in open-domain QA, we hypothesize that similar techniques can be applied to improve the efficiency and end-task performance of ICL. To verify this, we present a comprehensive study on applying three fusion methods\u2014concatenation-based (early fusion), FiD (intermediate), and ensemble-based (late)\u2014to ICL. We adopt a meta-learning setup where a model is first trained to perform ICL on a mixture of tasks using one selected fusion method, then evaluated on held-out tasks for ICL. Results on 11 held-out tasks show that FiD-ICL matches or outperforms the other two fusion methods. Additionally, we show that FiD-ICL (1) is 10x faster at inference time compared to concat-based and ensemble-based ICL, as we can easily pre-compute the representations of in-context examples and reuse them; (2) enables scaling up to meta-training 3B-sized models, which would fail for concat-based ICL.", + "authors": [ + "Qinyuan Ye", + "Iz Beltagy", + "Matthew Peters", + "Xiang Ren", + "Hannaneh Hajishirzi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.454", + "point2d": [ + -14.175980567932129, + -16.92344093322754 + ], + "cluster": 3.0 + }, + { + "idx": 456, + "title": "S2ynRE: Two-stage Self-training with Synthetic data for Low-resource Relation Extraction", + "abstract": "Current relation extraction methods suffer from the inadequacy of large-scale annotated data. While distant supervision alleviates the problem of data quantities, there still exists domain disparity in data qualities due to its reliance on domain-restrained knowledge bases. 
In this work, we propose S2ynRE, a framework of two-stage Self-training with Synthetic data for Relation Extraction. We first leverage the capability of large language models to adapt to the target domain and automatically synthesize large quantities of coherent, realistic training data. We then propose an accompanying two-stage self-training algorithm that iteratively and alternately learns from synthetic and golden data together. We conduct comprehensive experiments and detailed ablations on popular relation extraction datasets to demonstrate the effectiveness of the proposed framework.", + "authors": [ + "Benfeng Xu", + "Quan Wang", + "Yajuan Lyu", + "Dai Dai", + "Yongdong Zhang", + "Zhendong Mao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.455", + "point2d": [ + 40.40568542480469, + -63.11970520019531 + ], + "cluster": 25.0 + }, + { + "idx": 457, + "title": "DSEE: Dually Sparsity-embedded Efficient Tuning of Pre-trained Language Models", + "abstract": "Gigantic pre-trained models have become central to natural language processing (NLP), serving as the starting point for fine-tuning towards a range of downstream tasks. However, two pain points persist for this paradigm: (a) as the pre-trained models grow bigger (e.g., 175B parameters for GPT-3), even the fine-tuning process can be time-consuming and computationally expensive; (b) the fine-tuned model has the same size as its starting point by default, which is neither sensible due to its more specialized functionality, nor practical since many fine-tuned models will be deployed in resource-constrained environments. To address these pain points, we propose a framework for resource- and parameter-efficient fine-tuning by leveraging the sparsity prior in both weight updates and the final model weights. Our proposed framework, dubbed Dually Sparsity-Embedded Efficient Tuning (DSEE), aims to achieve two key objectives: (i) parameter efficient fine-tuning - by enforcing sparsity-aware low-rank updates on top of the pre-trained weights; and (ii) resource-efficient inference - by encouraging a sparse weight structure towards the final fine-tuned model. We leverage sparsity in these two directions by exploiting both unstructured and structured sparse patterns in pre-trained language models via a unified approach. Extensive experiments and in-depth investigations, with diverse network backbones (i.e., BERT, RoBERTa, and GPT-2) on dozens of datasets, consistently demonstrate impressive parameter-/inference-efficiency, while maintaining competitive downstream performance. For instance, DSEE saves about 25% inference FLOPs while achieving comparable performance, with 0.5% trainable parameters on BERT. Codes are available at https://github.com/VITA-Group/DSEE.", + "authors": [ + "Xuxi Chen", + "Tianlong Chen", + "Weizhu Chen", + "Ahmed Hassan Awadallah", + "Zhangyang Wang", + "Yu Cheng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.456", + "point2d": [ + -36.72536087036133, + -18.99895477294922 + ], + "cluster": 8.0 + }, + { + "idx": 458, + "title": "CASE: Aligning Coarse-to-Fine Cognition and Affection for Empathetic Response Generation", + "abstract": "Empathetic conversation is psychologically supposed to be the result of conscious alignment and interaction between the cognition and affection of empathy. 
However, existing empathetic dialogue models usually consider only the affective aspect or treat cognition and affection in isolation, which limits the capability of empathetic response generation. In this work, we propose the CASE model for empathetic dialogue generation. It first builds upon a commonsense cognition graph and an emotional concept graph and then aligns the user\u2019s cognition and affection at both the coarse-grained and fine-grained levels. Through automatic and manual evaluation, we demonstrate that CASE outperforms state-of-the-art baselines of empathetic dialogues and can generate more empathetic and informative responses.", + "authors": [ + "Jinfeng Zhou", + "Chujie Zheng", + "Bo Wang", + "Zheng Zhang", + "Minlie Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.457", + "point2d": [ + 25.895959854125977, + 64.03067016601562 + ], + "cluster": 33.0 + }, + { + "idx": 459, + "title": "Comparative evaluation of boundary-relaxed annotation for Entity Linking performance", + "abstract": "Entity Linking performance has a strong reliance on having a large quantity of high-quality annotated training data available. Yet, manual annotation of named entities, especially their boundaries, is ambiguous, error-prone, and raises many inconsistencies between annotators. While imprecise boundary annotation can degrade a model\u2019s performance, there are applications where accurate extraction of entities\u2019 surface form is not necessary. For those cases, a lenient annotation guideline could relieve the annotators\u2019 workload and speed up the process. This paper presents a case study designed to verify the feasibility of such an annotation process and evaluate the impact of boundary-relaxed annotation in an Entity Linking pipeline. We first generate a set of noisy versions of the widely used AIDA CoNLL-YAGO dataset by expanding the boundaries of subsets of annotated entity mentions and then train three Entity Linking models on this data and evaluate the relative impact of imprecise annotation on entity recognition and disambiguation performances. We demonstrate that the magnitude of effects caused by noise in the Named Entity Recognition phase is dependent on both model complexity and noise ratio, while Entity Disambiguation components are susceptible to entity boundary imprecision due to strong vocabulary dependency.", + "authors": [ + "Gabriel Herman Bernardim Andrade", + "Shuntaro Yada", + "Eiji Aramaki" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.458", + "point2d": [ + 35.11610412597656, + -81.78349304199219 + ], + "cluster": 14.0 + }, + { + "idx": 460, + "title": "Do CoNLL-2003 Named Entity Taggers Still Work Well in 2023?", + "abstract": "The CoNLL-2003 English named entity recognition (NER) dataset has been widely used to train and evaluate NER models for almost 20 years. However, it is unclear how well models that are trained on this 20-year-old data and developed over a period of decades using the same test set will perform when applied on modern data. In this paper, we evaluate the generalization of over 20 different models trained on CoNLL-2003, and show that NER models have very different generalization. Surprisingly, we find no evidence of performance degradation in pre-trained Transformers, such as RoBERTa and T5, even when fine-tuned using decades-old data. 
We investigate why some models generalize well to new data while others do not, and attempt to disentangle the effects of temporal drift and overfitting due to test reuse. Our analysis suggests that most deterioration is due to temporal mismatch between the pre-training corpora and the downstream test sets. We find that four factors are important for good generalization: model architecture, number of parameters, time period of the pre-training corpus, and the amount of fine-tuning data. We suggest current evaluation methods have, in some sense, underestimated progress on NER over the past 20 years, as NER models have not only improved on the original CoNLL-2003 test set, but improved even more on modern data. Our datasets can be found at https://github.com/ShuhengL/acl2023_conllpp.", + "authors": [ + "Shuheng Liu", + "Alan Ritter" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.459", + "point2d": [ + 29.091581344604492, + -83.22134399414062 + ], + "cluster": 14.0 + }, + { + "idx": 461, + "title": "READIN: A Chinese Multi-Task Benchmark with Realistic and Diverse Input Noises", + "abstract": "For many real-world applications, the user-generated inputs usually contain various noises due to speech recognition errors caused by linguistic variations or typographical errors (typos). Thus, it is crucial to test model performance on data with realistic input noises to ensure robustness and fairness. However, little study has been done to construct such benchmarks for Chinese, where various language-specific input noises happen in the real world. In order to fill this important gap, we construct READIN: a Chinese multi-task benchmark with REalistic And Diverse Input Noises. READIN contains four diverse tasks and requests annotators to re-enter the original test data with two commonly used Chinese input methods: Pinyin input and speech input. We designed our annotation pipeline to maximize diversity, for example by instructing the annotators to use diverse input method editors (IMEs) for keyboard noises and recruiting speakers from diverse dialectal groups for speech noises. We experiment with a series of strong pretrained language models as well as robust training methods, and we find that these models often suffer significant performance drops on READIN even with robustness methods like data augmentation. As the first large-scale attempt in creating a benchmark with noises geared towards user-generated inputs, we believe that READIN serves as an important complement to existing Chinese NLP benchmarks. The source code and dataset can be obtained from https://github.com/thunlp/READIN.", + "authors": [ + "Chenglei Si", + "Zhengyan Zhang", + "Yingfa Chen", + "Xiaozhi Wang", + "Zhiyuan Liu", + "Maosong Sun" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.460", + "point2d": [ + -40.77562713623047, + 5.138546466827393 + ], + "cluster": 30.0 + }, + { + "idx": 462, + "title": "MAD-TSC: A Multilingual Aligned News Dataset for Target-dependent Sentiment Classification", + "abstract": "Target-dependent sentiment classification (TSC) enables a fine-grained automatic analysis of sentiments expressed in texts. 
Sentiment expression varies depending on the domain, and it is necessary to create domain-specific datasets. While socially important, TSC in the news domain remains relatively understudied. We introduce MAD-TSC, a new dataset which differs substantially from existing resources. First, it includes aligned examples in eight languages to facilitate a comparison of performance for individual languages, and a direct comparison of human and machine translation. Second, the dataset is sampled from a diversified parallel news corpus, and is diversified in terms of news sources and geographic spread of entities. Finally, MAD-TSC is more challenging than existing datasets because its examples are more complex. We exemplify the use of MAD-TSC with comprehensive monolingual and multilingual experiments. The latter show that machine translations can successfully replace manual ones, and that performance for all included languages can match that of English by automatically translating test examples.", + "authors": [ + "Evan Dufraisse", + "Adrian Popescu", + "Julien Tourille", + "Armelle Brun", + "Jerome Deshayes" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.461", + "point2d": [ + 14.002432823181152, + -31.631437301635742 + ], + "cluster": 1.0 + }, + { + "idx": 463, + "title": "A New Dataset and Empirical Study for Sentence Simplification in Chinese", + "abstract": "Sentence Simplification is a valuable technique that can benefit language learners and children a lot. However, current research focuses more on English sentence simplification. The development of Chinese sentence simplification is relatively slow due to the lack of data. To alleviate this limitation, this paper introduces CSS, a new dataset for assessing sentence simplification in Chinese. We collect manual simplifications from human annotators and perform data analysis to show the difference between English and Chinese sentence simplifications. Furthermore, we test several unsupervised and zero/few-shot learning methods on CSS and analyze the automatic evaluation and human evaluation results. In the end, we explore whether Large Language Models can serve as high-quality Chinese sentence simplification systems by evaluating them on CSS.", + "authors": [ + "Shiping Yang", + "Renliang Sun", + "Xiaojun Wan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.462", + "point2d": [ + -30.930971145629883, + 28.485836029052734 + ], + "cluster": 35.0 + }, + { + "idx": 464, + "title": "Factual or Contextual? Disentangling Error Types in Entity Description Generation", + "abstract": "In the task of entity description generation, given a context and a specified entity, a model must describe that entity correctly and in a contextually-relevant way. In this task, as well as broader language generation tasks, the generation of a nonfactual description (factual error) versus an incongruous description (contextual error) is fundamentally different, yet often conflated. We develop an evaluation paradigm that enables us to disentangle these two types of errors in naturally occurring textual contexts. We find that factuality and congruity are often at odds, and that models specifically struggle with accurate descriptions of entities that are less familiar to people. 
This shortcoming of language models raises concerns around the trustworthiness of such models, since factual errors on less well-known entities are exactly those that a human reader will not recognize.", + "authors": [ + "Navita Goyal", + "Ani Nenkova", + "Hal Daum\u00e9 III" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.463", + "point2d": [ + 47.71646499633789, + 4.238575458526611 + ], + "cluster": 36.0 + }, + { + "idx": 465, + "title": "Weakly Supervised Vision-and-Language Pre-training with Relative Representations", + "abstract": "Weakly supervised vision-and-language pre-training (WVLP), which learns cross-modal representations with limited cross-modal supervision, has been shown to effectively reduce the data cost of pre-training while maintaining decent performance on downstream tasks. However, current WVLP methods use only local descriptions of images, i.e., object tags, as cross-modal anchors to construct weakly-aligned image-text pairs for pre-training. This affects the data quality and thus the effectiveness of pre-training. In this paper, we propose to directly take a small number of aligned image-text pairs as anchors, and represent each unaligned image and text by its similarities to these anchors, i.e., relative representations. We build a WVLP framework based on the relative representations, namely RELIT, which collects high-quality weakly-aligned image-text pairs from large-scale image-only and text-only data for pre-training through relative representation-based retrieval and generation. Experiments on four downstream tasks show that RELIT achieves new state-of-the-art results under the weakly supervised setting.", + "authors": [ + "Chi Chen", + "Peng Li", + "Maosong Sun", + "Yang Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.464", + "point2d": [ + -58.26029586791992, + 35.49082946777344 + ], + "cluster": 26.0 + }, + { + "idx": 466, + "title": "HermEs: Interactive Spreadsheet Formula Prediction via Hierarchical Formulet Expansion", + "abstract": "We propose HermEs, the first approach for spreadsheet formula prediction via HiEraRchical forMulet ExpanSion, where hierarchical expansion means generating formulas following the underlying parse tree structure, and Formulet refers to commonly-used multi-level patterns mined from real formula parse trees. HermEs improves the formula prediction accuracy by (1) guaranteeing correct grammar by hierarchical generation rather than left-to-right generation and (2) significantly streamlining the token-level decoding with high-level Formulet. Notably, instead of generating formulas in a pre-defined fixed order, we propose a novel sampling strategy to systematically exploit a variety of hierarchical and multi-level expansion orders and provide a solid mathematical proof, with the aim of meeting diverse human needs of the formula writing order in real applications. 
We further develop an interactive formula completion interface based on HermEs, which showcases a new user experience (see https://github.com/formulet/HERMES).", + "authors": [ + "Wanrong He", + "Haoyu Dong", + "Yihuai Gao", + "Zhichao Fan", + "Xingzhuo Guo", + "Zhitao Hou", + "Xiao Lv", + "Ran Jia", + "Shi Han", + "Dongmei Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.465", + "point2d": [ + 42.278385162353516, + -22.20280647277832 + ], + "cluster": 12.0 + }, + { + "idx": 467, + "title": "ArgU: A Controllable Factual Argument Generator", + "abstract": "Effective argumentation is essential towards a purposeful conversation with a satisfactory outcome. For example, persuading someone to reconsider smoking might involve empathetic, well founded arguments based on facts and expert opinions about its ill-effects and the consequences on one\u2019s family. However, the automatic generation of high-quality factual arguments can be challenging. Addressing existing controllability issues can make the recent advances in computational models for argument generation a potential solution. In this paper, we introduce ArgU: a neural argument generator capable of producing factual arguments from input facts and real-world concepts that can be explicitly controlled for stance and argument structure using Walton\u2019s argument scheme-based control codes. Unfortunately, computational argument generation is a relatively new field and lacks datasets conducive to training. Hence, we have compiled and released an annotated corpus of 69,428 arguments spanning six topics and six argument schemes, making it the largest publicly available corpus for identifying argument schemes; the paper details our annotation and dataset creation framework. We further experiment with an argument generation strategy that establishes an inference strategy by generating an \u201cargument template\u201d before actual argument generation. Our results demonstrate that it is possible to automatically generate diverse arguments exhibiting different inference patterns for the same set of facts by using control codes based on argument schemes and stance.", + "authors": [ + "Sougata Saha", + "Rohini Srihari" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.466", + "point2d": [ + 40.57890701293945, + 38.12057113647461 + ], + "cluster": 31.0 + }, + { + "idx": 468, + "title": "Learning Answer Generation using Supervision from Automatic Question Answering Evaluators", + "abstract": "Recent studies show that sentence-level extractive QA, i.e., based on Answer Sentence Selection (AS2), is outperformed by Generation-based QA (GenQA) models, which generate answers using the top-k answer sentences ranked by AS2 models (a la retrieval-augmented generation style). In this paper, we propose a novel training paradigm for GenQA using supervision from automatic QA evaluation models (GAVA). Specifically, we propose three strategies to transfer knowledge from these QA evaluation models to a GenQA model: (i) augmenting training data with answers generated by the GenQA model and labelled by GAVA (either statically, before training, or (ii) dynamically, at every training epoch); and (iii) using the GAVA score for weighting the generator loss during the learning of the GenQA model. 
We evaluate our proposed methods on two academic and one industrial dataset, obtaining a significant improvement in answering accuracy over the previous state of the art.", + "authors": [ + "Matteo Gabburo", + "Siddhant Garg", + "Rik Koncel-Kedziorski", + "Alessandro Moschitti" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.467", + "point2d": [ + 63.85960006713867, + 14.71417236328125 + ], + "cluster": 5.0 + }, + { + "idx": 469, + "title": "RECAP: Retrieval-Enhanced Context-Aware Prefix Encoder for Personalized Dialogue Response Generation", + "abstract": "Endowing chatbots with a consistent persona is essential to an engaging conversation, yet it remains an unresolved challenge. In this work, we propose a new retrieval-enhanced approach for personalized response generation. Specifically, we design a hierarchical transformer retriever trained on dialogue domain data to perform personalized retrieval and a context-aware prefix encoder that fuses the retrieved information to the decoder more effectively. Extensive experiments on a real-world dataset demonstrate the effectiveness of our model at generating more fluent and personalized responses. We quantitatively evaluate our model\u2019s performance under a suite of human and automatic metrics and find it to be superior to state-of-the-art baselines on English Reddit conversations.", + "authors": [ + "Shuai Liu", + "Hyundong Cho", + "Marjorie Freedman", + "Xuezhe Ma", + "Jonathan May" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.468", + "point2d": [ + 10.924842834472656, + 64.31208801269531 + ], + "cluster": 49.0 + }, + { + "idx": 470, + "title": "Don\u2019t Parse, Choose Spans! Continuous and Discontinuous Constituency Parsing via Autoregressive Span Selection", + "abstract": "We present a simple and unified approach for both continuous and discontinuous constituency parsing via autoregressive span selection. Constituency parsing aims to produce a set of non-crossing spans so that they can form a constituency parse tree. We sort gold spans using a predefined order and leverage a pointer network to autoregressively select spans by that order. To deal with discontinuous spans, we consecutively select their subspans from left to right, label all but the last subspan with special discontinuous labels and the last subspan as the whole discontinuous spans\u2019 labels. We use a simple heuristic to output valid trees so that our approach is able to predict all possible continuous and discontinuous constituency trees without sacrificing data coverage and without the need to use expensive chart-based parsing algorithms. Experiments on multiple continuous and discontinuous benchmarks show that our model achieves state-of-the-art or competitive performance.", + "authors": [ + "Songlin Yang", + "Kewei Tu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.469", + "point2d": [ + -25.299530029296875, + -63.15915298461914 + ], + "cluster": 41.0 + }, + { + "idx": 471, + "title": "Laziness Is a Virtue When It Comes to Compositionality in Neural Semantic Parsing", + "abstract": "Nearly all general-purpose neural semantic parsers generate logical forms in a strictly top-down autoregressive fashion. 
Though such systems have achieved impressive results across a variety of datasets and domains, recent works have called into question whether they are ultimately limited in their ability to compositionally generalize. In this work, we approach semantic parsing from, quite literally, the opposite direction; that is, we introduce a neural semantic parsing generation method that constructs logical forms from the bottom up, beginning from the logical form\u2019s leaves. The system we introduce is lazy in that it incrementally builds up a set of potential semantic parses, but only expands and processes the most promising candidate parses at each generation step. Such a parsimonious expansion scheme allows the system to maintain an arbitrarily large set of parse hypotheses that are never realized and thus incur minimal computational overhead. We evaluate our approach on compositional generalization; specifically, on the challenging CFQ dataset and two other Text-to-SQL datasets where we show that our novel, bottom-up semantic parsing technique outperforms general-purpose semantic parsers while also being competitive with semantic parsers that have been tailored to each task.", + "authors": [ + "Maxwell Crouse", + "Pavan Kapanipathi", + "Subhajit Chaudhury", + "Tahira Naseem", + "Ramon Fernandez Astudillo", + "Achille Fokoue", + "Tim Klinger" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.470", + "point2d": [ + -28.535757064819336, + -58.16917419433594 + ], + "cluster": 41.0 + }, + { + "idx": 472, + "title": "AD-KD: Attribution-Driven Knowledge Distillation for Language Model Compression", + "abstract": "Knowledge distillation has attracted a great deal of interest recently to compress large language models. However, existing knowledge distillation methods suffer from two limitations. First, the student model simply imitates the teacher\u2019s behavior while ignoring the reasoning behind it. Second, these methods usually focus on the transfer of sophisticated model-specific knowledge but overlook data-specific knowledge. In this paper, we present a novel attribution-driven knowledge distillation approach, which explores the token-level rationale behind the teacher model based on Integrated Gradients (IG) and transfers attribution knowledge to the student model. To enhance the knowledge transfer of model reasoning and generalization, we further explore multi-view attribution distillation on all potential decisions of the teacher. Comprehensive experiments are conducted with BERT on the GLUE benchmark. The experimental results demonstrate the superior performance of our approach to several state-of-the-art methods.", + "authors": [ + "Siyue Wu", + "Hongzhan Chen", + "Xiaojun Quan", + "Qifan Wang", + "Rui Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.471", + "point2d": [ + -48.378482818603516, + -21.490867614746094 + ], + "cluster": 39.0 + }, + { + "idx": 473, + "title": "(QA)^2: Question Answering with Questionable Assumptions", + "abstract": "Naturally occurring information-seeking questions often contain questionable assumptions\u2014assumptions that are false or unverifiable. Questions containing questionable assumptions are challenging because they require a distinct answer strategy that deviates from typical answers for information-seeking questions. 
For instance, the question \u201cWhen did Marie Curie discover Uranium?\u201d cannot be answered as a typical \u201cwhen\u201d question without addressing the false assumption \u201cMarie Curie discovered Uranium\u201d. In this work, we propose (QA)2 (Question Answering with Questionable Assumptions), an open-domain evaluation dataset consisting of naturally occurring search engine queries that may or may not contain questionable assumptions. To be successful on (QA)2, systems must be able to detect questionable assumptions and also be able to produce adequate responses for both typical information-seeking questions and ones with questionable assumptions. Through human rater acceptability on end-to-end QA with (QA)2, we find that current models do struggle with handling questionable assumptions, leaving substantial headroom for progress.", + "authors": [ + "Najoung Kim", + "Phu Mon Htut", + "Samuel R. Bowman", + "Jackson Petty" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.472", + "point2d": [ + 63.58429718017578, + -1.164168119430542 + ], + "cluster": 5.0 + }, + { + "idx": 474, + "title": "Attributable and Scalable Opinion Summarization", + "abstract": "We propose a method for unsupervised opinion summarization that encodes sentences from customer reviews into a hierarchical discrete latent space, then identifies common opinions based on the frequency of their encodings. We are able to generate both abstractive summaries by decoding these frequent encodings, and extractive summaries by selecting the sentences assigned to the same frequent encodings. Our method is attributable, because the model identifies sentences used to generate the summary as part of the summarization process. It scales easily to many hundreds of input reviews, because aggregation is performed in the latent space rather than over long sequences of tokens. We also demonstrate that our approach enables a degree of control, generating aspect-specific summaries by restricting the model to parts of the encoding space that correspond to desired aspects (e.g., location or food). Automatic and human evaluation on two datasets from different domains demonstrates that our method generates summaries that are more informative than prior work and better grounded in the input reviews.", + "authors": [ + "Tom Hosking", + "Hao Tang", + "Mirella Lapata" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.473", + "point2d": [ + -2.294185161590576, + 38.85062026977539 + ], + "cluster": 7.0 + }, + { + "idx": 475, + "title": "Targeted Data Generation: Finding and Fixing Model Weaknesses", + "abstract": "Even when aggregate accuracy is high, state-of-the-art NLP models often fail systematically on specific subgroups of data, resulting in unfair outcomes and eroding user trust. Additional data collection may not help in addressing these weaknesses, as such challenging subgroups may be unknown to users, and underrepresented in the existing and new data. We propose Targeted Data Generation (TDG), a framework that automatically identifies challenging subgroups, and generates new data for those subgroups using large language models (LLMs) with a human in the loop. TDG estimates the expected benefit and potential harm of data augmentation for each subgroup, and selects the ones most likely to improve within-group performance without hurting overall performance. 
In our experiments, TDG significantly improves the accuracy on challenging subgroups for state-of-the-art sentiment analysis and natural language inference models, while also improving overall test accuracy.", + "authors": [ + "Zexue He", + "Marco Tulio Ribeiro", + "Fereshte Khani" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.474", + "point2d": [ + 15.294604301452637, + 26.146631240844727 + ], + "cluster": 17.0 + }, + { + "idx": 476, + "title": "HiFi: High-Information Attention Heads Hold for Parameter-Efficient Model Adaptation", + "abstract": "To fully leverage the advantages of large-scale pre-trained language models (PLMs) on downstream tasks, it has become a ubiquitous adaptation paradigm to fine-tune all the parameters of PLMs. However, this paradigm poses issues of inefficient updating and resource over-consumption for fine-tuning in data-scarce and resource-limited scenarios, because of the large scale of parameters in PLMs. To alleviate these concerns, in this paper, we propose a parameter-efficient fine-tuning method HiFi, that is, only the highly informative and strongly correlated attention heads for the specific task are fine-tuned. To search for those significant attention heads, we develop a novel framework to analyze the effectiveness of heads. Specifically, we first model the relationship between heads into a graph from two perspectives of information richness and correlation, and then apply the PageRank algorithm to determine the relative importance of each head. Extensive experiments on the GLUE benchmark demonstrate the effectiveness of our method, and show that HiFi obtains state-of-the-art performance over the prior baselines.", + "authors": [ + "Anchun Gui", + "Han Xiao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.475", + "point2d": [ + -35.661277770996094, + -16.408191680908203 + ], + "cluster": 8.0 + }, + { + "idx": 477, + "title": "CFSum Coarse-to-Fine Contribution Network for Multimodal Summarization", + "abstract": "Multimodal summarization usually suffers from the problem that the contribution of the visual modality is unclear. Existing multimodal summarization approaches focus on designing the fusion methods of different modalities, while ignoring the adaptive conditions under which visual modalities are useful. Therefore, we propose a novel Coarse-to-Fine contribution network for multimodal Summarization (CFSum) to consider different contributions of images for summarization. First, to eliminate the interference of useless images, we propose a pre-filter module to abandon useless images. Second, to make accurate use of useful images, we propose two levels of visual complement modules, word level and phrase level. Specifically, image contributions are calculated and are adopted to guide the attention of both textual and visual modalities. Experimental results have shown that CFSum significantly outperforms multiple strong baselines on the standard benchmark. 
Furthermore, the analysis verifies that useful images can even help generate non-visual words which are implicitly represented in the image.", + "authors": [ + "Min Xiao", + "Junnan Zhu", + "Haitao Lin", + "Yu Zhou", + "Chengqing Zong" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.476", + "point2d": [ + -40.98230743408203, + 54.723167419433594 + ], + "cluster": 7.0 + }, + { + "idx": 478, + "title": "On \u201cScientific Debt\u201d in NLP: A Case for More Rigour in Language Model Pre-Training Research", + "abstract": "This evidence-based position paper critiques current research practices within the language model pre-training literature. Despite rapid recent progress afforded by increasingly better pre-trained language models (PLMs), current PLM research practices often conflate different possible sources of model improvement, without conducting proper ablation studies and principled comparisons between different models under comparable conditions. These practices (i) leave us ill-equipped to understand which pre-training approaches should be used under what circumstances; (ii) impede reproducibility and credit assignment; and (iii) render it difficult to understand: \u201cHow exactly does each factor contribute to the progress that we have today?\u201d We provide a case in point by revisiting the success of BERT over its baselines, ELMo and GPT-1, and demonstrate how \u2014 under comparable conditions where the baselines are tuned to a similar extent \u2014 these baselines (and even-simpler variants thereof) can, in fact, achieve competitive or better performance than BERT. These findings demonstrate how disentangling different factors of model improvements can lead to valuable new insights. We conclude with recommendations for how to encourage and incentivize this line of work, and accelerate progress towards a better and more systematic understanding of what factors drive the progress of our foundation models today.", + "authors": [ + "Made Nindyatama Nityasya", + "Haryo Wibowo", + "Alham Fikri Aji", + "Genta Winata", + "Radityo Eko Prasojo", + "Phil Blunsom", + "Adhiguna Kuncoro" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.477", + "point2d": [ + -29.907058715820312, + -30.959827423095703 + ], + "cluster": 8.0 + }, + { + "idx": 479, + "title": "End-to-end Knowledge Retrieval with Multi-modal Queries", + "abstract": "We investigate knowledge retrieval with multi-modal queries, i.e. queries containing information split across image and text inputs, a challenging task that differs from previous work on cross-modal retrieval. We curate a new dataset called ReMuQ for benchmarking progress on this task. ReMuQ requires a system to retrieve knowledge from a large corpus by integrating contents from both text and image queries. We introduce a retriever model \u201cReViz\u201d that can directly process input text and images to retrieve relevant knowledge in an end-to-end fashion without being dependent on intermediate modules such as object detectors or caption generators. We introduce a new pretraining task that is effective for learning knowledge retrieval with multimodal queries and also improves performance on downstream tasks. 
We demonstrate superior performance in retrieval on two datasets (ReMuQ and OK-VQA) under zero-shot settings as well as further improvements when finetuned on these datasets.", + "authors": [ + "Man Luo", + "Zhiyuan Fang", + "Tejas Gokhale", + "Yezhou Yang", + "Chitta Baral" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.478", + "point2d": [ + 7.633042335510254, + -12.242733001708984 + ], + "cluster": 26.0 + }, + { + "idx": 480, + "title": "AV-TranSpeech: Audio-Visual Robust Speech-to-Speech Translation", + "abstract": "Direct speech-to-speech translation (S2ST) aims to convert speech from one language into another, and has demonstrated significant progress to date. Despite the recent success, current S2ST models still suffer from distinct degradation in noisy environments and fail to translate visual speech (i.e., the movement of lips and teeth). In this work, we present AV-TranSpeech, the first audio-visual speech-to-speech (AV-S2ST) translation model without relying on intermediate text. AV-TranSpeech complements the audio stream with visual information to promote system robustness and opens up a host of practical applications: dictation or dubbing archival films. To mitigate the data scarcity with limited parallel AV-S2ST data, we 1) explore self-supervised pre-training with unlabeled audio-visual data to learn contextual representation, and 2) introduce cross-modal distillation with S2ST models trained on the audio-only corpus to further reduce the requirements of visual data. Experimental results on two language pairs demonstrate that AV-TranSpeech outperforms audio-only models under all settings regardless of the type of noise. With low-resource audio-visual data (10h, 30h), cross-modal distillation yields an improvement of 7.6 BLEU on average compared with baselines.Audio samples are available at https://AV-TranSpeech.github.io/.", + "authors": [ + "Rongjie Huang", + "Huadai Liu", + "Xize Cheng", + "Yi Ren", + "Linjun Li", + "Zhenhui Ye", + "Jinzheng He", + "Lichao Zhang", + "Jinglin Liu", + "Xiang Yin", + "Zhou Zhao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.479", + "point2d": [ + -69.02816772460938, + 25.041669845581055 + ], + "cluster": 37.0 + }, + { + "idx": 481, + "title": "Dual Class Knowledge Propagation Network for Multi-label Few-shot Intent Detection", + "abstract": "Multi-label intent detection aims to assign multiple labels to utterances and attracts increasing attention as a practical task in task-oriented dialogue systems. As dialogue domains change rapidly and new intents emerge fast, the lack of annotated data motivates multi-label few-shot intent detection. However, previous studies are confused by the identical representation of the utterance with multiple labels and overlook the intrinsic intra-class and inter-class interactions. To address these two limitations, we propose a novel dual class knowledge propagation network in this paper. In order to learn well-separated representations for utterances with multiple intents, we first introduce a label-semantic augmentation module incorporating class name information. For better consideration of the inherent intra-class and inter-class relations, an instance-level and a class-level graph neural network are constructed, which not only propagate label information but also propagate feature structure. And we use a simple yet effective method to predict the intent count of each utterance. 
Extensive experimental results on two multi-label intent datasets have demonstrated that our proposed method outperforms strong baselines by a large margin.", + "authors": [ + "Feng Zhang", + "Wei Chen", + "Fei Ding", + "Tengjiao Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.480", + "point2d": [ + -7.57188081741333, + 74.81779479980469 + ], + "cluster": 32.0 + }, + { + "idx": 482, + "title": "VendorLink: An NLP approach for Identifying & Linking Vendor Migrants & Potential Aliases on Darknet Markets", + "abstract": "The anonymity on the Darknet allows vendors to stay undetected by using multiple vendor aliases or frequently migrating between markets. Consequently, illegal markets and their connections are challenging to uncover on the Darknet. To identify relationships between illegal markets and their vendors, we propose VendorLink, an NLP-based approach that examines writing patterns to verify, identify, and link unique vendor accounts across text advertisements (ads) on seven public Darknet markets. In contrast to existing literature, VendorLink utilizes the strength of supervised pre-training to perform closed-set vendor verification, open-set vendor identification, and low-resource market adaption tasks. Through VendorLink, we uncover (i) 15 migrants and 71 potential aliases in the Alphabay-Dreams-Silk dataset, (ii) 17 migrants and 3 potential aliases in the Valhalla-Berlusconi dataset, and (iii) 75 migrants and 10 potential aliases in the Traderoute-Agora dataset. Altogether, our approach can help Law Enforcement Agencies (LEA) make more informed decisions by verifying and identifying migrating vendors and their potential aliases on existing and Low-Resource (LR) emerging Darknet markets.", + "authors": [ + "Vageesh Saxena", + "Nils Rethmeier", + "Gijs van Dijck", + "Gerasimos Spanakis" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.481", + "point2d": [ + 31.278486251831055, + 23.864391326904297 + ], + "cluster": 15.0 + }, + { + "idx": 483, + "title": "Element-aware Summarization with Large Language Models: Expert-aligned Evaluation and Chain-of-Thought Method", + "abstract": "Automatic summarization generates concise summaries that contain key ideas of source documents. As the most mainstream datasets for the news sub-domain, CNN/DailyMail and BBC XSum have been widely used for performance benchmarking. However, the reference summaries of those datasets turn out to be noisy, mainly in terms of factual hallucination and information redundancy. To address this challenge, we first annotate new expert-writing Element-aware test sets following the \u201cLasswell Communication Model\u201d proposed by Lasswell, allowing reference summaries to focus on more fine-grained news elements objectively and comprehensively. Utilizing the new test sets, we observe the surprising zero-shot summary ability of LLMs, which addresses the issue of the inconsistent results between human preference and automatic evaluation metrics of LLMs\u2019 zero-shot summaries in prior work. Further, we propose a Summary Chain-of-Thought (SumCoT) technique to elicit LLMs to generate summaries step by step, which helps them integrate more fine-grained details of source documents into the final summaries that correlate with the human writing mindset. 
Experimental results show our method outperforms state-of-the-art fine-tuned PLMs and zero-shot LLMs by +4.33/+4.77 in ROUGE-L on the two datasets, respectively. Dataset and code are publicly available at https://github.com/Alsace08/SumCoT.", + "authors": [ + "Yiming Wang", + "Zhuosheng Zhang", + "Rui Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.482", + "point2d": [ + -7.417372226715088, + 39.53620910644531 + ], + "cluster": 7.0 + }, + { + "idx": 484, + "title": "Efficient Shapley Values Estimation by Amortization for Text Classification", + "abstract": "Despite the popularity of Shapley Values in explaining neural text classification models, computing them is prohibitive for large pretrained models due to a large number of model evaluations. In practice, Shapley Values are often estimated with a small number of stochastic model evaluations. However, we show that the estimated Shapley Values are sensitive to random seed choices \u2013 the top-ranked features often have little overlap across different seeds, especially on examples with longer input texts. This can only be mitigated by aggregating thousands of model evaluations, which on the other hand, induces substantial computational overheads. To mitigate the trade-off between stability and efficiency, we develop an amortized model that directly predicts each input feature\u2019s Shapley Value without additional model evaluations. It is trained on a set of examples whose Shapley Values are estimated from a large number of model evaluations to ensure stability. Experimental results on two text classification datasets demonstrate that our amortized model estimates Shapley Values accurately with up to 60 times speedup compared to traditional methods. Further, our model does not suffer from stability issues as inference is deterministic. We release our code at https://github.com/yangalan123/Amortized-Interpretability.", + "authors": [ + "Chenghao Yang", + "Fan Yin", + "He He", + "Kai-Wei Chang", + "Xiaofei Ma", + "Bing Xiang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.483", + "point2d": [ + -9.100380897521973, + -0.974315345287323 + ], + "cluster": 17.0 + }, + { + "idx": 485, + "title": "PeerDA: Data Augmentation via Modeling Peer Relation for Span Identification Tasks", + "abstract": "Span identification aims at identifying specific text spans from text input and classifying them into pre-defined categories. Different from previous works that merely leverage the Subordinate (SUB) relation (i.e. if a span is an instance of a certain category) to train models, this paper for the first time explores the Peer (PR) relation, which indicates that two spans are instances of the same category and share similar features. Specifically, a novel Peer Data Augmentation (PeerDA) approach is proposed which employs span pairs with the PR relation as the augmentation data for training. PeerDA has two unique advantages: (1) There are a large number of PR span pairs for augmenting the training data. (2) The augmented data can prevent the trained model from over-fitting the superficial span-category mapping by pushing the model to leverage the span semantics. Experimental results on ten datasets over four diverse tasks across seven domains demonstrate the effectiveness of PeerDA. 
Notably, PeerDA achieves state-of-the-art results on six of them.", + "authors": [ + "Weiwen Xu", + "Xin Li", + "Yang Deng", + "Wai Lam", + "Lidong Bing" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.484", + "point2d": [ + -21.098365783691406, + -50.9876708984375 + ], + "cluster": 17.0 + }, + { + "idx": 486, + "title": "Dynamic Regularization in UDA for Transformers in Multimodal Classification", + "abstract": "Multimodal machine learning is a cutting-edge field that explores ways to incorporate information from multiple sources into models. As more multimodal data becomes available, this field has become increasingly relevant. This work focuses on two key challenges in multimodal machine learning. The first is finding efficient ways to combine information from different data types. The second is that often, one modality (e.g., text) is stronger and more relevant, making it difficult to identify meaningful patterns in the weaker modality (e.g., image). Our approach focuses on more effectively exploiting the weaker modality while dynamically regularizing the loss function. First, we introduce a new two-stream model called Multimodal BERT-ViT, which features a novel intra-CLS token fusion. Second, we devise a dynamic adjustment that maintains a balance between specialization and generalization during training to avoid overfitting. We add this dynamic adjustment to the Unsupervised Data Augmentation (UDA) framework. We evaluate the effectiveness of these proposals on the task of multi-label movie genre classification using the Moviescope and MM-IMDb datasets. The evaluation revealed that our proposal offers substantial benefits, while simultaneously enabling us to harness the weaker modality without compromising the information provided by the stronger.", + "authors": [ + "Ivonne Monter-Aldana", + "Adrian Pastor Lopez Monroy", + "Fernando Sanchez-Vega" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.485", + "point2d": [ + -41.971923828125, + 59.138309478759766 + ], + "cluster": 16.0 + }, + { + "idx": 487, + "title": "Conflicts, Villains, Resolutions: Towards models of Narrative Media Framing", + "abstract": "Despite increasing interest in the automatic detection of media frames in NLP, the problem is typically simplified as single-label classification and adopts a topic-like view on frames, evading modelling the broader document-level narrative. In this work, we revisit a widely used conceptualization of framing from the communication sciences which explicitly captures elements of narratives, including conflict and its resolution, and integrate it with the narrative framing of key entities in the story as heroes, victims or villains. We adapt an effective annotation paradigm that breaks a complex annotation task into a series of simpler binary questions, and present an annotated data set of English news articles, and a case study on the framing of climate change in articles from news outlets across the political spectrum. Finally, we explore automatic multi-label prediction of our frames with supervised and semi-supervised approaches, and present a novel retrieval-based method which is both effective and transparent in its predictions. 
We conclude with a discussion of opportunities and challenges for future work on document-level models of narrative framing.", + "authors": [ + "Lea Frermann", + "Jiatong Li", + "Shima Khanehzar", + "Gosia Mikolajczak" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.486", + "point2d": [ + 37.88511276245117, + 34.799137115478516 + ], + "cluster": 19.0 + }, + { + "idx": 488, + "title": "bgGLUE: A Bulgarian General Language Understanding Evaluation Benchmark", + "abstract": "We present bgGLUE (Bulgarian General Language Understanding Evaluation), a benchmark for evaluating language models on Natural Language Understanding (NLU) tasks in Bulgarian. Our benchmark includes NLU tasks targeting a variety of NLP problems (e.g., natural language inference, fact-checking, named entity recognition, sentiment analysis, question answering, etc.) and machine learning tasks (sequence labeling, document-level classification, and regression). We run the first systematic evaluation of pre-trained language models for Bulgarian, comparing and contrasting results across the nine tasks in the benchmark. The evaluation results show strong performance on sequence labeling tasks, but there is a lot of room for improvement for tasks that require more complex reasoning. We make bgGLUE publicly available together with the fine-tuning and the evaluation code, as well as a public leaderboard at https://bgglue.github.io, and we hope that it will enable further advancements in developing NLU models for Bulgarian.", + "authors": [ + "Momchil Hardalov", + "Pepa Atanasova", + "Todor Mihaylov", + "Galia Angelova", + "Kiril Simov", + "Petya Osenova", + "Veselin Stoyanov", + "Ivan Koychev", + "Preslav Nakov", + "Dragomir Radev" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.487", + "point2d": [ + -21.114990234375, + -37.21393585205078 + ], + "cluster": 46.0 + }, + { + "idx": 489, + "title": "DuNST: Dual Noisy Self Training for Semi-Supervised Controllable Text Generation", + "abstract": "Self-training (ST) has prospered again in language understanding by augmenting the fine-tuning of big pre-trained models when labeled data is insufficient. However, it remains challenging to incorporate ST into attribute-controllable language generation. Augmented only by self-generated pseudo text, generation models over-exploit the previously learned text space and fail to explore a larger one, suffering from a restricted generalization boundary and limited controllability. In this work, we propose DuNST, a novel ST framework to tackle these problems. DuNST jointly models text generation and classification as a dual process and further perturbs and escapes from the collapsed space by adding two kinds of flexible noise. In this way, our model could construct and utilize both pseudo text generated from given labels and pseudo labels predicted from available unlabeled text, which are gradually refined during the ST phase. We theoretically demonstrate that DuNST can be regarded as enhancing the exploration of the potentially larger real text space while maintaining exploitation, guaranteeing improved performance. 
Experiments on three controllable generation tasks show that DuNST significantly boosts control accuracy with comparable generation fluency and diversity against several strong baselines.", + "authors": [ + "Yuxi Feng", + "Xiaoyuan Yi", + "Xiting Wang", + "Laks Lakshmanan, V.S.", + "Xing Xie" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.488", + "point2d": [ + -22.259063720703125, + 4.6148362159729 + ], + "cluster": 4.0 + }, + { + "idx": 490, + "title": "What does the Failure to Reason with \u201cRespectively\u201d in Zero/Few-Shot Settings Tell Us about Language Models?", + "abstract": "Humans can effortlessly understand the coordinate structure of sentences such as \u201cNiels Bohr and Kurt Cobain were born in Copenhagen and Seattle, *respectively*\u201d. In the context of natural language inference (NLI), we examine how language models (LMs) reason with respective readings (Gawron and Kehler, 2004) from two perspectives: syntactic-semantic and commonsense-world knowledge. We propose a controlled synthetic dataset WikiResNLI and a naturally occurring dataset NatResNLI to encompass various explicit and implicit realizations of \u201crespectively\u201d. We show that fine-tuned NLI models struggle with understanding such readings without explicit supervision. While few-shot learning is easy in the presence of explicit cues, longer training is required when the reading is evoked implicitly, leaving models to rely on common sense inferences. Furthermore, our fine-grained analysis indicates models fail to generalize across different constructions. To conclude, we demonstrate that LMs still lag behind humans in generalizing to the long tail of linguistic constructions.", + "authors": [ + "Ruixiang Cui", + "Seolhwa Lee", + "Daniel Hershcovich", + "Anders S\u00f8gaard" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.489", + "point2d": [ + 41.16029739379883, + -7.5361127853393555 + ], + "cluster": 36.0 + }, + { + "idx": 491, + "title": "BLIND: Bias Removal With No Demographics", + "abstract": "Models trained on real-world data tend to imitate and amplify social biases. Common methods to mitigate biases require prior information on the types of biases that should be mitigated (e.g., gender or racial bias) and the social groups associated with each data sample. In this work, we introduce BLIND, a method for bias removal with no prior knowledge of the demographics in the dataset. While training a model on a downstream task, BLIND detects biased samples using an auxiliary model that predicts the main model\u2019s success, and down-weights those samples during the training process. Experiments with racial and gender biases in sentiment classification and occupation classification tasks demonstrate that BLIND mitigates social biases without relying on a costly demographic annotation process. Our method is competitive with other methods that require demographic information and sometimes even surpasses them.", + "authors": [ + "Hadas Orgad", + "Yonatan Belinkov" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.490", + "point2d": [ + 14.833211898803711, + 28.97603416442871 + ], + "cluster": 10.0 + }, + { + "idx": 492, + "title": "How do humans perceive adversarial text? 
A reality check on the validity and naturalness of word-based adversarial attacks", + "abstract": "Natural Language Processing (NLP) models based on Machine Learning (ML) are susceptible to adversarial attacks \u2013 malicious algorithms that imperceptibly modify input text to force models into making incorrect predictions. However, evaluations of these attacks ignore the property of imperceptibility or study it under limited settings. This entails that adversarial perturbations would not pass any human quality gate and do not represent real threats to human-checked NLP systems. To bypass this limitation and enable proper assessment (and later, improvement) of NLP model robustness, we have surveyed 378 human participants about the perceptibility of text adversarial examples produced by state-of-the-art methods. Our results underline that existing text attacks are impractical in real-world scenarios where humans are involved. This contrasts with previous smaller-scale human studies, which reported overly optimistic conclusions regarding attack success. Through our work, we hope to position human perceptibility as a first-class success criterion for text attacks, and provide guidance for research to build effective attack algorithms and, in turn, design appropriate defence mechanisms.", + "authors": [ + "Salijona Dyrmishi", + "Salah Ghamizi", + "Maxime Cordy" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.491", + "point2d": [ + 3.6843883991241455, + 9.124783515930176 + ], + "cluster": 15.0 + }, + { + "idx": 493, + "title": "Soft Alignment Objectives for Robust Adaptation of Language Generation", + "abstract": "Domain adaptation allows generative language models to address specific flaws caused by the domain shift of their application. However, the traditional adaptation by further training on in-domain data rapidly weakens the model\u2019s ability to generalize to other domains, making the open-ended deployments of the adapted models prone to errors. This work introduces novel training objectives built upon a semantic similarity of the predicted tokens to the reference. Our results show that (1) avoiding the common assumption of a single correct prediction by constructing the training target from tokens\u2019 semantic similarity can largely mitigate catastrophic forgetting of adaptation, while (2) preserving the adaptation in-domain quality, (3) with negligible additions to compute costs. In the broader context, the objectives grounded in a continuous token similarity pioneer the exploration of the middle ground between the efficient but naive exact-match token-level objectives and expressive but computationally- and resource-intensive sequential objectives.", + "authors": [ + "Michal \u0160tef\u00e1nik", + "Marek Kadlcik", + "Petr Sojka" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.492", + "point2d": [ + -25.588451385498047, + 2.1612539291381836 + ], + "cluster": 4.0 + }, + { + "idx": 494, + "title": "The CRINGE Loss: Learning what language not to model", + "abstract": "Standard language model training employs gold human documents or human-human interaction data, and treats all training data as positive examples. Growing evidence shows that even with very large amounts of positive training data, issues remain that can be alleviated with relatively small amounts of negative data \u2013 examples of what the model should not do. 
In this work, we propose a novel procedure to train with such data called the \u201cCRINGE\u201d loss (ContRastive Iterative Negative GEneration). We show the effectiveness of this approach across three different experiments on the tasks of safe generation, contradiction avoidance, and open-domain dialogue. Our models outperform multiple strong baselines and are conceptually simple, easy to train and implement.", + "authors": [ + "Leonard Adolphs", + "Tianyu Gao", + "Jing Xu", + "Kurt Shuster", + "Sainbayar Sukhbaatar", + "Jason Weston" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.493", + "point2d": [ + 6.929537773132324, + 57.12458038330078 + ], + "cluster": 2.0 + }, + { + "idx": 495, + "title": "Modeling User Satisfaction Dynamics in Dialogue via Hawkes Process", + "abstract": "Dialogue systems have received increasing attention while automatically evaluating their performance remains challenging. User satisfaction estimation (USE) has been proposed as an alternative. It assumes that the performance of a dialogue system can be measured by user satisfaction and uses an estimator to simulate users. The effectiveness of USE depends heavily on the estimator. Existing estimators independently predict user satisfaction at each turn and ignore satisfaction dynamics across turns within a dialogue. In order to fully simulate users, it is crucial to take satisfaction dynamics into account. To fill this gap, we propose a new estimator ASAP (sAtisfaction eStimation via HAwkes Process) that treats user satisfaction across turns as an event sequence and employs a Hawkes process to effectively model the dynamics in this sequence. Experimental results on four benchmark dialogue datasets demonstrate that ASAP can substantially outperform state-of-the-art baseline estimators.", + "authors": [ + "Fanghua Ye", + "Zhiyuan Hu", + "Emine Yilmaz" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.494", + "point2d": [ + 18.141172409057617, + 73.69038391113281 + ], + "cluster": 24.0 + }, + { + "idx": 496, + "title": "Towards Identifying Fine-Grained Depression Symptoms from Memes", + "abstract": "The past decade has observed significant attention toward developing computational methods for classifying social media data based on the presence or absence of mental health conditions. In the context of mental health, for clinicians to make an accurate diagnosis or provide personalized intervention, it is crucial to identify fine-grained mental health symptoms. To this end, we conduct a focused study on depression disorder and introduce a new task of identifying fine-grained depressive symptoms from memes. Toward this, we create a high-quality dataset (RESTORE) annotated with 8 fine-grained depression symptoms based on the clinically adopted PHQ-9 questionnaire.We benchmark RESTORE on 20 strong monomodal and multimodal methods. Additionally, we show how imposing orthogonal constraints on textual and visual feature representations in a multimodal setting can enforce the model to learn non-redundant and de-correlated features leading to a better prediction of fine-grained depression symptoms. 
Further, we conduct an extensive human analysis and elaborate on the limitations of existing multimodal models that often overlook the implicit connection between visual and textual elements of a meme.", + "authors": [ + "Shweta Yadav", + "Cornelia Caragea", + "Chenye Zhao", + "Naincy Kumari", + "Marvin Solberg", + "Tanmay Sharma" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.495", + "point2d": [ + 34.30607604980469, + 74.26921844482422 + ], + "cluster": 23.0 + }, + { + "idx": 497, + "title": "SLUE Phase-2: A Benchmark Suite of Diverse Spoken Language Understanding Tasks", + "abstract": "Spoken language understanding (SLU) tasks have been studied for many decades in the speech research community, but have not received as much attention as lower-level tasks like speech and speaker recognition. In this work, we introduce several new annotated SLU benchmark tasks based on freely available speech data, which complement existing benchmarks and address gaps in the SLU evaluation landscape. We contribute four tasks: question answering and summarization involve inference over longer speech sequences; named entity localization addresses the speech-specific task of locating the targeted content in the signal; dialog act classification identifies the function of a given speech utterance. In order to facilitate the development of SLU models that leverage the success of pre-trained speech representations, we will release a new benchmark suite, including for each task (i) curated annotations for a relatively small fine-tuning set, (ii) reproducible pipeline (speech recognizer + text model) and end-to-end baseline models and evaluation metrics, (iii) baseline model performance in various types of systems for easy comparisons. We present the details of data collection and annotation and the performance of the baseline models. We also analyze the sensitivity of pipeline models\u2019 performance to the speech recognition accuracy, using more than 20 publicly available speech recognition models.", + "authors": [ + "Suwon Shon", + "Siddhant Arora", + "Chyi-Jiunn Lin", + "Ankita Pasad", + "Felix Wu", + "Roshan S Sharma", + "Wei-Lun Wu", + "Hung-yi Lee", + "Karen Livescu", + "Shinji Watanabe" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.496", + "point2d": [ + -3.5388057231903076, + 64.24486541748047 + ], + "cluster": 30.0 + }, + { + "idx": 498, + "title": "My side, your side and the evidence: Discovering aligned actor groups and the narratives they weave", + "abstract": "News reports about emerging issues often include several conflicting story lines. Individual stories can be conceptualized as samples from an underlying mixture of competing narratives. The automated identification of these distinct narratives from unstructured text is a fundamental yet difficult task in Computational Linguistics since narratives are often intertwined and only implicitly conveyed in text. In this paper, we consider a more feasible proxy task: Identify the distinct sets of aligned story actors responsible for sustaining the issue-specific narratives. Discovering aligned actors, and the groups these alignments create, brings us closer to estimating the narrative that each group represents. 
With the help of Large Language Models (LLM), we address this task by: (i) Introducing a corpus of text segments rich in narrative content associated with six different current issues; (ii) Introducing a novel two-step graph-based framework that (a) identifies alignments between actors (INCANT) and (b) extracts aligned actor groups using the network structure (TAMPA). Amazon Mechanical Turk evaluations demonstrate the effectiveness of our framework. Across domains, alignment relationships from INCANT are accurate (macro F1 >= 0.75) and actor groups from TAMPA are preferred over 2 non-trivial baseline models (ACC >= 0.75).", + "authors": [ + "Pavan Holur", + "David Chong", + "Timothy Tangherlini", + "Vwani Roychowdhury" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.497", + "point2d": [ + 37.907379150390625, + 34.31281280517578 + ], + "cluster": 19.0 + }, + { + "idx": 499, + "title": "Characterizing and Measuring Linguistic Dataset Drift", + "abstract": "NLP models often degrade in performance when real world data distributions differ markedly from training data. However, existing dataset drift metrics in NLP have generally not considered specific dimensions of linguistic drift that affect model performance, and they have not been validated in their ability to predict model performance at the individual example level, where such metrics are often used in practice. In this paper, we propose three dimensions of linguistic dataset drift: vocabulary, structural, and semantic drift. These dimensions correspond to content word frequency divergences, syntactic divergences, and meaning changes not captured by word frequencies (e.g. lexical semantic change). We propose interpretable metrics for all three drift dimensions, and we modify past performance prediction methods to predict model performance at both the example and dataset level for English sentiment classification and natural language inference. We find that our drift metrics are more effective than previous metrics at predicting out-of-domain model accuracies (mean 16.8% root mean square error decrease), particularly when compared to popular fine-tuned embedding distances (mean 47.7% error decrease). Fine-tuned embedding distances are much more effective at ranking individual examples by expected performance, but decomposing into vocabulary, structural, and semantic drift produces the best example rankings of all considered model-agnostic drift metrics (mean 6.7% ROC AUC increase).", + "authors": [ + "Tyler Chang", + "Kishaloy Halder", + "Neha Anna John", + "Yogarshi Vyas", + "Yassine Benajiba", + "Miguel Ballesteros", + "Dan Roth" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.498", + "point2d": [ + 4.745001316070557, + 23.3106632232666 + ], + "cluster": 17.0 + }, + { + "idx": 500, + "title": "WebCPM: Interactive Web Search for Chinese Long-form Question Answering", + "abstract": "Long-form question answering (LFQA) aims at answering complex, open-ended questions with detailed, paragraph-length responses. The de facto paradigm of LFQA necessitates two procedures: information retrieval, which searches for relevant supporting facts, and information synthesis, which integrates these facts into a coherent answer. In this paper, we introduce WebCPM, the first Chinese LFQA dataset. One unique feature of WebCPM is that its information retrieval is based on interactive web search, which engages with a search engine in real time. 
Following WebGPT, we develop a web search interface. We recruit annotators to search for relevant information using our interface and then answer questions. Meanwhile, the web search behaviors of our annotators are recorded. In total, we collect 5,500 high-quality question-answer pairs, together with 15,372 supporting facts and 125,954 web search actions. We fine-tune pre-trained language models to imitate human behaviors for web search and to generate answers based on the collected facts. Our LFQA pipeline, built on these fine-tuned models, generates answers that are no worse than human-written ones in 32.5% and 47.5% of the cases on our dataset and DuReader, respectively. The interface, dataset, and codes are publicly available at https://github.com/thunlp/WebCPM.", + "authors": [ + "Yujia Qin", + "Zihan Cai", + "Dian Jin", + "Lan Yan", + "Shihao Liang", + "Kunlun Zhu", + "Yankai Lin", + "Xu Han", + "Ning Ding", + "Huadong Wang", + "Ruobing Xie", + "Fanchao Qi", + "Zhiyuan Liu", + "Maosong Sun", + "Jie Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.499", + "point2d": [ + 70.04267120361328, + 9.295259475708008 + ], + "cluster": 5.0 + }, + { + "idx": 501, + "title": "Synthesize, Prompt and Transfer: Zero-shot Conversational Question Generation with Pre-trained Language Model", + "abstract": "Conversational question generation aims to generate questions that depend on both context and conversation history. Conventional works utilizing deep learning have shown promising results, but heavily rely on the availability of large-scale annotated conversations. In this paper, we introduce a more realistic and less explored setting, Zero-shot Conversational Question Generation (ZeroCQG), which requires no human-labeled conversations for training. To solve ZeroCQG, we propose a multi-stage knowledge transfer framework, Synthesize, Prompt, and trAnsfer with pRe-Trained lAnguage model (SPARTA) to effectively leverage knowledge from single-turn question generation instances. To validate the zero-shot performance of SPARTA, we conduct extensive experiments on three conversational datasets: CoQA, QuAC, and DoQA by transferring knowledge from three single-turn datasets: MS MARCO, NewsQA, and SQuAD. The experimental results demonstrate the superior performance of our method. Specifically, SPARTA has achieved 14.81 BLEU-4 (88.2% absolute improvement compared to T5) in CoQA with knowledge transferred from SQuAD.", + "authors": [ + "Hongwei Zeng", + "Bifan Wei", + "Jun Liu", + "Weiping Fu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.500", + "point2d": [ + 18.69331932067871, + 53.91120147705078 + ], + "cluster": 49.0 + }, + { + "idx": 502, + "title": "FormNetV2: Multimodal Graph Contrastive Learning for Form Document Information Extraction", + "abstract": "The recent advent of self-supervised pre-training techniques has led to a surge in the use of multimodal learning in form document understanding. However, existing approaches that extend the mask language modeling to other modalities require careful multi-task tuning, complex reconstruction target designs, or additional pre-training data. In FormNetV2, we introduce a centralized multimodal graph contrastive learning strategy to unify self-supervised pre-training for all modalities in one loss. 
The graph contrastive objective maximizes the agreement of multimodal representations, providing a natural interplay for all modalities without special customization. In addition, we extract image features within the bounding box that joins a pair of tokens connected by a graph edge, capturing more targeted visual cues without loading a sophisticated and separately pre-trained image embedder. FormNetV2 establishes new state-of-the-art performance on FUNSD, CORD, SROIE and Payment benchmarks with a more compact model size.", + "authors": [ + "Chen-Yu Lee", + "Chun-Liang Li", + "Hao Zhang", + "Timothy Dozat", + "Vincent Perot", + "Guolong Su", + "Xiang Zhang", + "Kihyuk Sohn", + "Nikolay Glushnev", + "Renshen Wang", + "Joshua Ainslie", + "Shangbang Long", + "Siyang Qin", + "Yasuhisa Fujii", + "Nan Hua", + "Tomas Pfister" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.501", + "point2d": [ + -49.09010314941406, + 43.81700134277344 + ], + "cluster": 43.0 + }, + { + "idx": 503, + "title": "MixCE: Training Autoregressive Language Models by Mixing Forward and Reverse Cross-Entropies", + "abstract": "Autoregressive language models are trained by minimizing the cross-entropy of the model distribution Q relative to the data distribution P \u2013 that is, minimizing the forward cross-entropy, which is equivalent to maximum likelihood estimation (MLE). We have observed that models trained in this way may \u201cover-generalize\u201d, in the sense that they produce non-human-like text. Moreover, we believe that reverse cross-entropy, i.e., the cross-entropy of P relative to Q, is a better reflection of how a human would evaluate text generated by a model. Hence, we propose learning with MixCE, an objective that mixes the forward and reverse cross-entropies. We evaluate models trained with this objective on synthetic data settings (where P is known) and real data, and show that the resulting models yield better generated text without complex decoding strategies.", + "authors": [ + "Shiyue Zhang", + "Shijie Wu", + "Ozan Irsoy", + "Steven Lu", + "Mohit Bansal", + "Mark Dredze", + "David Rosenberg" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.502", + "point2d": [ + -39.404258728027344, + -0.633611798286438 + ], + "cluster": 4.0 + }, + { + "idx": 504, + "title": "Knowledgeable Parameter Efficient Tuning Network for Commonsense Question Answering", + "abstract": "Commonsense question answering is important for making decisions about everyday matters. Although existing commonsense question answering works based on fully fine-tuned PLMs have achieved promising results, they suffer from prohibitive computation costs as well as poor interpretability. Some works improve the PLMs by incorporating knowledge to provide certain evidence, via elaborately designed GNN modules which require expertise. In this paper, we propose a simple knowledgeable parameter efficient tuning network to couple PLMs with external knowledge for commonsense question answering. Specifically, we design a trainable parameter-sharing adapter attached to a parameter-freezing PLM to incorporate knowledge at a small cost. The adapter is equipped with both entity- and query-related knowledge via two auxiliary knowledge-related tasks (i.e., span masking and relation discrimination). 
To make the adapter focus on the relevant knowledge, we design gating and attention mechanisms to respectively filter and fuse the query information from the PLM. Extensive experiments on two benchmark datasets show that KPE is parameter-efficient and can effectively incorporate knowledge for improving commonsense question answering.", + "authors": [ + "Ziwang Zhao", + "Linmei Hu", + "Hanyu Zhao", + "Yingxia Shao", + "Yequan Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.503", + "point2d": [ + 68.15118408203125, + -0.07519345730543137 + ], + "cluster": 5.0 + }, + { + "idx": 505, + "title": "BLASER: A Text-Free Speech-to-Speech Translation Evaluation Metric", + "abstract": "End-to-End speech-to-speech translation (S2ST) is generally evaluated with text-based metrics. This means that generated speech has to be automatically transcribed, making the evaluation dependent on the availability and quality of automatic speech recognition (ASR) systems. In this paper, we propose a text-free evaluation metric for end-to-end S2ST, named BLASER, to avoid the dependency on ASR systems. BLASER leverages a multilingual multimodal encoder to directly encode the speech segments for source input, translation output and reference into a shared embedding space and computes a score of the translation quality that can be used as a proxy to human evaluation. To evaluate our approach, we construct training and evaluation sets from more than 40k human annotations covering seven language directions. The best results of BLASER are achieved by training with supervision from human rating scores. We show that when evaluated at the sentence level, BLASER correlates significantly better with human judgment compared to ASR dependent metrics including ASR-SENTBLEU in all translation directions and ASR-COMET in five of them. Our analysis shows combining speech and text as inputs to BLASER does not increase the correlation with human scores, but best correlations are achieved when using speech, which motivates the goal of our research. Moreover, we show that using ASR for references is detrimental for text-based metrics.", + "authors": [ + "Mingda Chen", + "Paul-Ambroise Duquenne", + "Pierre Andrews", + "Justine Kao", + "Alexandre Mourachko", + "Holger Schwenk", + "Marta R. Costa-juss\u00e0" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.504", + "point2d": [ + -71.63697052001953, + 19.373924255371094 + ], + "cluster": 37.0 + }, + { + "idx": 506, + "title": "NLPositionality: Characterizing Design Biases of Datasets and Models", + "abstract": "Design biases in NLP systems, such as performance differences for different populations, often stem from their creator\u2019s positionality, i.e., views and lived experiences shaped by identity and background. Despite the prevalence and risks of design biases, they are hard to quantify because researcher, system, and dataset positionality is often unobserved. We introduce NLPositionality, a framework for characterizing design biases and quantifying the positionality of NLP datasets and models. Our framework continuously collects annotations from a diverse pool of volunteer participants on LabintheWild, and statistically quantifies alignment with dataset labels and model predictions. We apply NLPositionality to existing datasets and models for two tasks\u2014social acceptability and hate speech detection. 
To date, we have collected 16,299 annotations in over a year for 600 instances from 1,096 annotators across 87 countries. We find that datasets and models align predominantly with Western, White, college-educated, and younger populations. Additionally, certain groups, such as non-binary people and non-native English speakers, are further marginalized by datasets and models as they rank least in alignment across all tasks. Finally, we draw from prior literature to discuss how researchers can examine their own positionality and that of their datasets and models, opening the door for more inclusive NLP systems.", + "authors": [ + "Sebastin Santy", + "Jenny Liang", + "Ronan Le Bras", + "Katharina Reinecke", + "Maarten Sap" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.505", + "point2d": [ + 23.723237991333008, + 27.763635635375977 + ], + "cluster": 10.0 + }, + { + "idx": 507, + "title": "Backpack Language Models", + "abstract": "We present Backpacks: a new neural architecture that marries strong modeling performance with an interface for interpretability and control. Backpacks learn multiple non-contextual sense vectors for each word in a vocabulary, and represent a word in a sequence as a context-dependent, non-negative linear combination of sense vectors in this sequence. We find that, after training, sense vectors specialize, each encoding a different aspect of a word. We can interpret a sense vector by inspecting its (non-contextual, linear) projection onto the output space, and intervene on these interpretable hooks to change the model\u2019s behavior in predictable ways. We train a 170M-parameter Backpack language model on OpenWebText, matching the loss of a GPT-2 small (124M-parameter) Transformer. On lexical similarity evaluations, we find that Backpack sense vectors outperform even a 6B-parameter Transformer LM\u2019s word embeddings. Finally, we present simple algorithms that intervene on sense vectors to perform controllable text generation and debiasing. For example, we can edit the sense vocabulary to tend more towards a topic, or localize a source of gender bias to a sense vector and globally suppress that sense.", + "authors": [ + "John Hewitt", + "John Thickstun", + "Christopher Manning", + "Percy Liang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.506", + "point2d": [ + -33.49188232421875, + -25.677000045776367 + ], + "cluster": 6.0 + }, + { + "idx": 508, + "title": "WinoQueer: A Community-in-the-Loop Benchmark for Anti-LGBTQ+ Bias in Large Language Models", + "abstract": "We present WinoQueer: a benchmark specifically designed to measure whether large language models (LLMs) encode biases that are harmful to the LGBTQ+ community. The benchmark is community-sourced, via application of a novel method that generates a bias benchmark from a community survey. We apply our benchmark to several popular LLMs and find that off-the-shelf models generally do exhibit considerable anti-queer bias. Finally, we show that LLM bias against a marginalized community can be somewhat mitigated by finetuning on data written about or by members of that community, and that social media text written by community members is more effective than news text written about the community by non-members. 
Our method for community-in-the-loop benchmark development provides a blueprint for future researchers to develop community-driven, harms-grounded LLM benchmarks for other marginalized communities.", + "authors": [ + "Virginia Felkner", + "Ho-Chun Herbert Chang", + "Eugene Jang", + "Jonathan May" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.507", + "point2d": [ + 21.54349708557129, + 29.395801544189453 + ], + "cluster": 10.0 + }, + { + "idx": 509, + "title": "Grounded Multimodal Named Entity Recognition on Social Media", + "abstract": "In recent years, Multimodal Named Entity Recognition (MNER) on social media has attracted considerable attention. However, existing MNER studies only extract entity-type pairs in text, which is useless for multimodal knowledge graph construction and insufficient for entity disambiguation. To solve these issues, in this work, we introduce a Grounded Multimodal Named Entity Recognition (GMNER) task. Given a text-image social post, GMNER aims to identify the named entities in text, their entity types, and their bounding box groundings in image (i.e. visual regions). To tackle the GMNER task, we construct a Twitter dataset based on two existing MNER datasets. Moreover, we extend four well-known MNER methods to establish a number of baseline systems and further propose a Hierarchical Index generation framework named H-Index, which generates the entity-type-region triples in a hierarchical manner with a sequence-to-sequence model. Experiment results on our annotated dataset demonstrate the superiority of our H-Index framework over baseline systems on the GMNER task.", + "authors": [ + "Jianfei Yu", + "Ziyan Li", + "Jieming Wang", + "Rui Xia" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.508", + "point2d": [ + 33.0062255859375, + -79.85088348388672 + ], + "cluster": 14.0 + }, + { + "idx": 510, + "title": "Preserving Commonsense Knowledge from Pre-trained Language Models via Causal Inference", + "abstract": "Fine-tuning has been proven to be a simple and effective technique to transfer the learned knowledge of Pre-trained Language Models (PLMs) to downstream tasks. However, vanilla fine-tuning easily overfits the target data and degrades the generalization ability. Most existing studies attribute it to catastrophic forgetting, and they retain the pre-trained knowledge indiscriminately without identifying what knowledge is transferable. Motivated by this, we frame fine-tuning into a causal graph and discover that the crux of catastrophic forgetting lies in the missing causal effects from the pre-trained data. Based on the causal view, we propose a unified objective for fine-tuning to retrieve the causality back. Intriguingly, the unified objective can be seen as the sum of the vanilla fine-tuning objective, which learns new knowledge from target data, and the causal objective, which preserves old knowledge from PLMs. Therefore, our method is flexible and can mitigate negative transfer while preserving knowledge. Since endowing models with commonsense is a long-standing challenge, we implement our method on commonsense QA with a proposed heuristic estimation to verify its effectiveness. 
In the experiments, our method outperforms state-of-the-art fine-tuning methods on all six commonsense QA datasets and can be implemented as a plug-in module to inflate the performance of existing QA models.", + "authors": [ + "Junhao Zheng", + "Qianli Ma", + "Shengjie Qiu", + "Yue Wu", + "Peitian Ma", + "Junlong Liu", + "Huawen Feng", + "Xichen Shang", + "Haibin Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.509", + "point2d": [ + -29.997793197631836, + -21.673309326171875 + ], + "cluster": 39.0 + }, + { + "idx": 511, + "title": "Translation-Enhanced Multilingual Text-to-Image Generation", + "abstract": "Research on text-to-image generation (TTI) still predominantly focuses on the English language due to the lack of annotated image-caption data in other languages; in the long run, this might widen inequitable access to TTI technology. In this work, we thus investigate multilingual TTI (termed mTTI) and the current potential of neural machine translation (NMT) to bootstrap mTTI systems. We provide two key contributions. 1) Relying on a multilingual multi-modal encoder, we provide a systematic empirical study of standard methods used in cross-lingual NLP when applied to mTTI: Translate Train, Translate Test, and Zero-Shot Transfer. 2) We propose Ensemble Adapter (EnsAd), a novel parameter-efficient approach that learns to weigh and consolidate the multilingual text knowledge within the mTTI framework, mitigating the language gap and thus improving mTTI performance. Our evaluations on standard mTTI datasets COCO-CN, Multi30K Task2, and LAION-5B demonstrate the potential of translation-enhanced mTTI systems and also validate the benefits of the proposed EnsAd which derives consistent gains across all datasets. Further investigations on model variants, ablation studies, and qualitative analyses provide additional insights on the inner workings of the proposed mTTI approaches.", + "authors": [ + "Yaoyiran Li", + "Ching-Yun Chang", + "Stephen Rawls", + "Ivan Vuli\u0107", + "Anna Korhonen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.510", + "point2d": [ + -66.5540542602539, + 41.438045501708984 + ], + "cluster": 21.0 + }, + { + "idx": 512, + "title": "Benchmarking Large Language Model Capabilities for Conditional Generation", + "abstract": "Pre-trained large language models (PLMs) underlie most new developments in natural language processing. They have shifted the field from application-specific model pipelines to a single model that is adapted to a wide range of tasks. Autoregressive PLMs like GPT-3 or PaLM, and associated techniques like few-shot learning, have additionally shifted the output modality to generation instead of classification or regression. Despite their ubiquitous use, the generation quality of language models is rarely evaluated when these models are introduced. Additionally, it is unclear how existing generation tasks\u2013while they can be used to compare systems at a high level\u2013relate to the real world use cases for which people have been adopting them. In this work, we discuss how to adapt existing application-specific generation benchmarks to PLMs and provide an in-depth, empirical study of the limitations and capabilities of PLMs in natural language generation tasks along dimensions such as scale, architecture, input and output language. 
Our results show that PLMs differ in their applicability to different data regimes and their generalization to multiple languages. They further inform practitioners as to which PLMs to use for a given generation task setup. We share best practices to be taken into consideration when benchmarking generation capabilities during the development of upcoming PLMs.", + "authors": [ + "Joshua Maynez", + "Priyanka Agrawal", + "Sebastian Gehrmann" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.511", + "point2d": [ + -22.16875648498535, + 13.63137435913086 + ], + "cluster": 4.0 + }, + { + "idx": 513, + "title": "lilGym: Natural Language Visual Reasoning with Reinforcement Learning", + "abstract": "We present lilGym, a new benchmark for language-conditioned reinforcement learning in visual environments. lilGym is based on 2,661 highly-compositional human-written natural language statements grounded in an interactive visual environment. We introduce a new approach for exact reward computation in every possible world state by annotating all statements with executable Python programs. Each statement is paired with multiple start states and reward functions to form thousands of distinct Markov Decision Processes of varying difficulty. We experiment with lilGym with different models and learning regimes. Our results and analysis show that while existing methods are able to achieve non-trivial performance, lilGym forms a challenging open problem. lilGym is available at https://lil.nlp.cornell.edu/lilgym/.", + "authors": [ + "Anne Wu", + "Kiante Brantley", + "Noriyuki Kojima", + "Yoav Artzi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.512", + "point2d": [ + 55.55796813964844, + -7.840487480163574 + ], + "cluster": 36.0 + }, + { + "idx": 514, + "title": "Unsupervised Melody-to-Lyrics Generation", + "abstract": "Automatic melody-to-lyric generation is a task in which song lyrics are generated to go with a given melody. It is of significant practical interest and more challenging than unconstrained lyric generation as the music imposes additional constraints onto the lyrics. The training data is limited as most songs are copyrighted, resulting in models that underfit the complicated cross-modal relationship between melody and lyrics. In this work, we propose a method for generating high-quality lyrics without training on any aligned melody-lyric data. Specifically, we design a hierarchical lyric generation framework that first generates a song outline and second the complete lyrics. The framework enables disentanglement of training (based purely on text) from inference (melody-guided text generation) to circumvent the shortage of parallel data.We leverage the segmentation and rhythm alignment between melody and lyrics to compile the given melody into decoding constraints as guidance during inference. The two-step hierarchical design also enables content control via the lyric outline, a much-desired feature for democratizing collaborative song creation. Experimental results show that our model can generate high-quality lyrics that are more on-topic, singable, intelligible, and coherent than strong baselines, for example SongMASS, a SOTA model trained on a parallel dataset, with a 24% relative overall quality improvement based on human ratings. 
Our code is available at https://github.com/amazon-science/unsupervised-melody-to-lyrics-generation.",
    "authors": [
      "Yufei Tian",
      "Anjali Narayan-Chen",
      "Shereen Oraby",
      "Alessandra Cervone",
      "Gunnar Sigurdsson",
      "Chenyang Tao",
      "Wenbo Zhao",
      "Tagyoung Chung",
      "Jing Huang",
      "Nanyun Peng"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.513",
    "point2d": [
      -33.271080017089844,
      42.70041275024414
    ],
    "cluster": 35.0
  },
  {
    "idx": 515,
    "title": "Causality-aware Concept Extraction based on Knowledge-guided Prompting",
    "abstract": "Concepts benefit natural language understanding but are far from complete in existing knowledge graphs (KGs). Recently, pre-trained language models (PLMs) have been widely used in text-based concept extraction (CE). However, PLMs tend to mine the co-occurrence associations from massive corpora as pre-trained knowledge rather than the real causal effect between tokens. As a result, the pre-trained knowledge confounds PLMs to extract biased concepts based on spurious co-occurrence correlations, inevitably resulting in low precision. In this paper, through the lens of a Structural Causal Model (SCM), we propose equipping the PLM-based extractor with a knowledge-guided prompt as an intervention to alleviate concept bias. The prompt adopts the topic of the given entity from the existing knowledge in KGs to mitigate the spurious co-occurrence correlations between entities and biased concepts. Our extensive experiments on representative multilingual KG datasets justify that our proposed prompt can effectively alleviate concept bias and improve the performance of PLM-based CE models.",
    "authors": [
      "Siyu Yuan",
      "Deqing Yang",
      "Jinxi Liu",
      "Shuyu Tian",
      "Jiaqing Liang",
      "Yanghua Xiao",
      "Rui Xie"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.514",
    "point2d": [
      50.427215576171875,
      -60.341217041015625
    ],
    "cluster": 25.0
  },
  {
    "idx": 516,
    "title": "Span-level Aspect-based Sentiment Analysis via Table Filling",
    "abstract": "In this paper, we propose a novel span-level model for Aspect-Based Sentiment Analysis (ABSA), which aims at identifying the sentiment polarity of the given aspect. In contrast to conventional ABSA models that focus on modeling the word-level dependencies between an aspect and its corresponding opinion expressions, we propose Table Filling BERT (TF-BERT), which considers the consistency of multi-word opinion expressions at the span-level. Specifically, we learn the span representations with a table filling method, by constructing an upper triangular table for each sentiment polarity, of which the elements represent the sentiment intensity of the specific sentiment polarity for all spans in the sentence. Two methods are then proposed, including table-decoding and table-aggregation, to filter out target spans or aggregate each table for sentiment polarity classification. In addition, we design a sentiment consistency regularizer to guarantee the sentiment consistency of each span for different sentiment polarities.
Experimental results on three benchmarks demonstrate the effectiveness of our proposed model.", + "authors": [ + "Mao Zhang", + "Yongxin Zhu", + "Zhen Liu", + "Zhimin Bao", + "Yunfei Wu", + "Xing Sun", + "Linli Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.515", + "point2d": [ + 17.76737403869629, + -34.940792083740234 + ], + "cluster": 13.0 + }, + { + "idx": 517, + "title": "Limitations of Language Models in Arithmetic and Symbolic Induction", + "abstract": "Recent work has shown that large pretrained Language Models (LMs) can not only perform remarkably well on a range of Natural Language Processing (NLP) tasks but also start improving on reasoning tasks such as arithmetic induction, symbolic manipulation, and commonsense reasoning with increasing size of models. However, it is still unclear what the underlying capabilities of these LMs are. Surprisingly, we find that these models have limitations on certain basic symbolic manipulation tasks such as copy, reverse, and addition. When the total number of symbols or repeating symbols increases, the model performance drops quickly. We investigate the potential causes behind this phenomenon and examine a set of possible methods, including explicit positional markers, fine-grained computation steps, and LMs with callable programs. Experimental results show that none of these techniques can solve the simplest addition induction problem completely. In the end, we introduce LMs with tutor, which demonstrates every single step of teaching. LMs with tutor is able to deliver 100% accuracy in situations of OOD and repeating symbols, shedding new insights on the boundary of large LMs in induction.", + "authors": [ + "Jing Qian", + "Hong Wang", + "Zekun Li", + "Shiyang Li", + "Xifeng Yan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.516", + "point2d": [ + 41.47052764892578, + -18.516324996948242 + ], + "cluster": 12.0 + }, + { + "idx": 518, + "title": "EEL: Efficiently Encoding Lattices for Reranking", + "abstract": "Standard decoding approaches for conditional text generation tasks typically search for an output hypothesis with high model probability, but this may not yield the best hypothesis according to human judgments of quality. Reranking to optimize for \u201cdownstream\u201d metrics can more closely optimize for quality, but many metrics of interest are computed with pre-trained language models, which are slow to apply to large numbers of hypotheses. We explore an approach for reranking hypotheses by using Transformers to efficiently encode lattices of generated outputs, a method we call EEL. With a single Transformer pass over the entire lattice, we can approximately compute a contextualized representation of each token as if it were only part of a single hypothesis in isolation. We combine this approach with a new class of token-factored rerankers (TFRs) that allow for efficient extraction of high reranker-scoring hypotheses from the lattice. Empirically, our approach incurs minimal degradation error compared to the exponentially slower approach of encoding each hypothesis individually. 
When applying EEL with TFRs across three text generation tasks, our results show both substantial speedup compared to naive reranking and often better performance on downstream metrics than comparable approaches.",
    "authors": [
      "Prasann Singhal",
      "Jiacheng Xu",
      "Xi Ye",
      "Greg Durrett"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.517",
    "point2d": [
      -27.723310470581055,
      18.54514503479004
    ],
    "cluster": 4.0
  },
  {
    "idx": 519,
    "title": "CLAPSpeech: Learning Prosody from Text Context with Contrastive Language-Audio Pre-Training",
    "abstract": "Improving text representation has attracted much attention to achieve expressive text-to-speech (TTS). However, existing works only implicitly learn the prosody with masked token reconstruction tasks, which leads to low training efficiency and difficulty in prosody modeling. We propose CLAPSpeech, a cross-modal contrastive pre-training framework that learns from the prosody variance of the same text token under different contexts. Specifically, 1) with the design of a text encoder and a prosody encoder, we encourage the model to connect the text context with its corresponding prosody pattern in the joint multi-modal space; 2) we introduce a multi-scale pre-training pipeline to capture prosody patterns at multiple levels; and 3) we show how to incorporate CLAPSpeech into existing TTS models for better prosody. Experiments on three datasets not only show that CLAPSpeech could improve the prosody prediction for existing TTS methods, but also demonstrate its generalization ability to adapt to multiple languages and multi-speaker text-to-speech. We also deeply analyze the principle behind the performance of CLAPSpeech. Ablation studies demonstrate the necessity of each component in CLAPSpeech.
Source code and audio samples are available at https://clapspeech.github.io.",
    "authors": [
      "Zhenhui Ye",
      "Rongjie Huang",
      "Yi Ren",
      "Ziyue Jiang",
      "Jinglin Liu",
      "Jinzheng He",
      "Xiang Yin",
      "Zhou Zhao"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.518",
    "point2d": [
      -69.15447998046875,
      26.60032081604004
    ],
    "cluster": 37.0
  },
  {
    "idx": 520,
    "title": "Revisiting Cross-Lingual Summarization: A Corpus-based Study and A New Benchmark with Improved Annotation",
    "abstract": "Most existing cross-lingual summarization (CLS) work constructs CLS corpora by simply and directly translating pre-annotated summaries from one language to another, which can contain errors from both summarization and translation processes. To address this issue, we propose ConvSumX, a cross-lingual conversation summarization benchmark, through a new annotation schema that explicitly considers source input context. ConvSumX consists of 2 sub-tasks under different real-world scenarios, with each covering 3 language directions. We conduct thorough analysis on ConvSumX and 3 widely-used manually annotated CLS corpora and empirically find that ConvSumX is more faithful towards input text. Additionally, based on the same intuition, we propose a 2-Step method, which takes both conversation and summary as input to simulate the human annotation process. Experimental results show that the 2-Step method surpasses strong baselines on ConvSumX under both automatic and human evaluation. Analysis shows that both source input text and summary are crucial for modeling cross-lingual summaries.",
    "authors": [
      "Yulong Chen",
      "Huajian Zhang",
      "Yijie Zhou",
      "Xuefeng Bai",
      "Yueguan Wang",
      "Ming Zhong",
      "Jianhao Yan",
      "Yafu Li",
      "Judy Li",
      "Xianchao Zhu",
      "Yue Zhang"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.519",
    "point2d": [
      -11.985952377319336,
      41.00739288330078
    ],
    "cluster": 7.0
  },
  {
    "idx": 521,
    "title": "Learning Dynamic Contextualised Word Embeddings via Template-based Temporal Adaptation",
    "abstract": "Dynamic contextualised word embeddings (DCWEs) represent the temporal semantic variations of words. We propose a method for learning DCWEs by time-adapting a pretrained Masked Language Model (MLM) using time-sensitive templates. Given two snapshots C_1 and C_2 of a corpus taken respectively at two distinct timestamps T_1 and T_2, we first propose an unsupervised method to select (a) pivot terms related to both C_1 and C_2, and (b) anchor terms that are associated with a specific pivot term in each individual snapshot. We then generate prompts by filling manually compiled templates using the extracted pivot and anchor terms. Moreover, we propose an automatic method to learn time-sensitive templates from C_1 and C_2, without requiring any human supervision. Next, we use the generated prompts to adapt a pretrained MLM to T_2 by fine-tuning using those prompts. Multiple experiments show that our proposed method significantly reduces the perplexity of test sentences in C_2, outperforming the current state-of-the-art.",
    "authors": [
      "Xiaohang Tang",
      "Yi Zhou",
      "Danushka Bollegala"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.520",
    "point2d": [
      2.64310622215271,
      -42.6472282409668
    ],
    "cluster": 20.0
  },
  {
    "idx": 522,
    "title": "How poor is the stimulus? 
Evaluating hierarchical generalization in neural networks trained on child-directed speech", + "abstract": "When acquiring syntax, children consistently choose hierarchical rules over competing non-hierarchical possibilities. Is this preference due to a learning bias for hierarchical structure, or due to more general biases that interact with hierarchical cues in children\u2019s linguistic input? We explore these possibilities by training LSTMs and Transformers - two types of neural networks without a hierarchical bias - on data similar in quantity and content to children\u2019s linguistic input: text from the CHILDES corpus. We then evaluate what these models have learned about English yes/no questions, a phenomenon for which hierarchical structure is crucial. We find that, though they perform well at capturing the surface statistics of child-directed speech (as measured by perplexity), both model types generalize in a way more consistent with an incorrect linear rule than the correct hierarchical rule. These results suggest that human-like generalization from text alone requires stronger biases than the general sequence-processing biases of standard neural network architectures.", + "authors": [ + "Aditya Yedetore", + "Tal Linzen", + "Robert Frank", + "R. Thomas McCoy" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.521", + "point2d": [ + -25.944257736206055, + -44.27409744262695 + ], + "cluster": 6.0 + }, + { + "idx": 523, + "title": "GanLM: Encoder-Decoder Pre-training with an Auxiliary Discriminator", + "abstract": "Pre-trained models have achieved remarkable success in natural language processing (NLP). However, existing pre-training methods underutilize the benefits of language understanding for generation. Inspired by the idea of Generative Adversarial Networks (GANs), we propose a GAN-style model for encoder-decoder pre-training by introducing an auxiliary discriminator, unifying the ability of language understanding and generation in a single model. Our model, named as GanLM, is trained with two pre-training objectives: replaced token detection and replaced token denoising. Specifically, given masked source sentences, the generator outputs the target distribution and the discriminator predicts whether the target sampled tokens from distribution are incorrect. The target sentence is replaced with misclassified tokens to construct noisy previous context, which is used to generate the gold sentence. In general, both tasks improve the ability of language understanding and generation by selectively using the denoising data. 
Extensive experiments on language generation benchmarks show that GanLM, with its powerful language understanding capability, outperforms various strong pre-trained language models (PLMs) and achieves state-of-the-art performance.",
    "authors": [
      "Jian Yang",
      "Shuming Ma",
      "Li Dong",
      "Shaohan Huang",
      "Haoyang Huang",
      "Yuwei Yin",
      "Dongdong Zhang",
      "Liqun Yang",
      "Furu Wei",
      "Zhoujun Li"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.522",
    "point2d": [
      -18.76154899597168,
      0.09260538965463638
    ],
    "cluster": 4.0
  },
  {
    "idx": 524,
    "title": "Log-linear Guardedness and its Implications",
    "abstract": "Methods for erasing human-interpretable concepts from neural representations that assume linearity have been found to be tractable and useful. However, the impact of this removal on the behavior of downstream classifiers trained on the modified representations is not fully understood. In this work, we formally define the notion of linear guardedness as the inability of an adversary to predict the concept directly from the representation, and study its implications. We show that, in the binary case, under certain assumptions, a downstream log-linear model cannot recover the erased concept. However, we constructively demonstrate that a multiclass log-linear model can be constructed that indirectly recovers the concept in some cases, pointing to the inherent limitations of linear guardedness as a downstream bias mitigation technique. These findings shed light on the theoretical limitations of linear erasure methods and highlight the need for further research on the connections between intrinsic and extrinsic bias in neural models.",
    "authors": [
      "Shauli Ravfogel",
      "Yoav Goldberg",
      "Ryan Cotterell"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.523",
    "point2d": [
      1.1876875162124634,
      3.2159361839294434
    ],
    "cluster": 48.0
  },
  {
    "idx": 525,
    "title": "Searching for Needles in a Haystack: On the Role of Incidental Bilingualism in PaLM\u2019s Translation Capability",
    "abstract": "Large, multilingual language models exhibit surprisingly good zero- or few-shot machine translation capabilities, despite having never seen the intentionally-included translation examples provided to typical neural translation systems. We investigate the role of incidental bilingualism\u2014the unintentional consumption of bilingual signals, including translation examples\u2014in explaining the translation capabilities of large language models, taking the Pathways Language Model (PaLM) as a case study. We introduce a mixed-method approach to measure and understand incidental bilingualism at scale. We show that PaLM is exposed to over 30 million translation pairs across at least 44 languages. Furthermore, the amount of incidental bilingual content is highly correlated with the amount of monolingual in-language content for non-English languages. We relate incidental bilingual content to zero-shot prompts and show that it can be used to mine new prompts to improve PaLM\u2019s out-of-English zero-shot translation quality.
Finally, in a series of small-scale ablations, we show that its presence has a substantial impact on translation capabilities, although this impact diminishes with model scale.",
    "authors": [
      "Eleftheria Briakou",
      "Colin Cherry",
      "George Foster"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.524",
    "point2d": [
      -64.33778381347656,
      -2.837949752807617
    ],
    "cluster": 21.0
  },
  {
    "idx": 526,
    "title": "Open Set Relation Extraction via Unknown-Aware Training",
    "abstract": "The existing supervised relation extraction methods have achieved impressive performance in a closed-set setting, in which the relations remain the same during both training and testing. In a more realistic open-set setting, unknown relations may appear in the test set. Due to the lack of supervision signals from unknown relations, a well-performing closed-set relation extractor can still confidently misclassify them into known relations. In this paper, we propose an unknown-aware training method, regularizing the model by dynamically synthesizing negative instances that can provide the missing supervision signals. Inspired by text adversarial attacks, we adaptively apply small but critical perturbations to the original training data, synthesizing negative instances that are difficult enough to be mistaken by the model as known relations, thus facilitating a compact decision boundary. Experimental results show that our method achieves SOTA unknown relation detection without compromising the classification of known relations.",
    "authors": [
      "Jun Zhao",
      "Xin Zhao",
      "WenYu Zhan",
      "Qi Zhang",
      "Tao Gui",
      "Zhongyu Wei",
      "Yun Wen Chen",
      "Xiang Gao",
      "Xuanjing Huang"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.525",
    "point2d": [
      38.71550369262695,
      -63.99631881713867
    ],
    "cluster": 38.0
  },
  {
    "idx": 527,
    "title": "Learning to Imagine: Visually-Augmented Natural Language Generation",
    "abstract": "People often imagine relevant scenes to aid in the writing process. In this work, we aim to utilize visual information for composition in the same manner as humans. We propose a method, LIVE, that makes pre-trained language models (PLMs) Learn to Imagine for Visually-augmented natural language gEneration. First, we imagine the scene based on the text: we use a diffusion model to synthesize high-quality images conditioned on the input texts. Second, we use CLIP to determine whether the text can evoke the imagination in a posterior way. Finally, our imagination is dynamic, and we conduct synthesis for each sentence rather than generate only one image for an entire paragraph. Technically, we propose a novel plug-and-play fusion layer to obtain visually-augmented representations for each text. Our vision-text fusion layer is compatible with Transformer-based architecture. We have conducted extensive experiments on four generation tasks using BART and T5, and the automatic results and human evaluation demonstrate the effectiveness of our proposed method.
We will release the code, model, and data at the link: https://github.com/RUCAIBox/LIVE.",
    "authors": [
      "Tianyi Tang",
      "Yushuo Chen",
      "Yifan Du",
      "Junyi Li",
      "Wayne Xin Zhao",
      "Ji-Rong Wen"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.526",
    "point2d": [
      -63.42051315307617,
      50.164371490478516
    ],
    "cluster": 43.0
  },
  {
    "idx": 528,
    "title": "Generating Hashtags for Short-form Videos with Guided Signals",
    "abstract": "Short-form video hashtag recommendation (SVHR) aims to recommend hashtags to content creators from videos and corresponding descriptions. Most prior studies regard SVHR as a classification or ranking problem and select hashtags from a set of limited candidates. However, in reality, users can create new hashtags, and trending hashtags change rapidly over time on social media. Both of these properties cannot be easily modeled with classification approaches. To bridge this gap, we formulate SVHR as a generation task that better represents how hashtags are created naturally. Additionally, we propose the Guided Generative Model (GGM) where we augment the input features by retrieving relevant hashtags from a large-scale hashtag pool as extra guidance signals. Experimental results on two short-form video datasets show that our generative models outperform strong classification baselines, and the guidance signals further boost the performance by 8.11 and 2.17 absolute ROUGE-1 scores on average, respectively. We also perform extensive analyses including human evaluation, demonstrating that our generative model can create meaningful and relevant novel hashtags while achieving state-of-the-art performance on known hashtags.",
    "authors": [
      "Tiezheng Yu",
      "Hanchao Yu",
      "Davis Liang",
      "Yuning Mao",
      "Shaoliang Nie",
      "Po-Yao Huang",
      "Madian Khabsa",
      "Pascale Fung",
      "Yi-Chia Wang"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.527",
    "point2d": [
      2.0748255252838135,
      32.652198791503906
    ],
    "cluster": 18.0
  },
  {
    "idx": 529,
    "title": "NEUROSTRUCTURAL DECODING: Neural Text Generation with Structural Constraints",
    "abstract": "Text generation often involves producing coherent and grammatically correct texts that also satisfy a given set of semantic constraints. While most approaches for conditional text generation have primarily focused on lexical constraints, they often struggle to effectively incorporate syntactic constraints, which provide a richer language for approximating semantic constraints. We address this gap by introducing NeuroStructural Decoding, a new decoding algorithm that incorporates syntactic constraints to further improve the quality of the generated text. We build NeuroStructural Decoding on the NeuroLogic Decoding (Lu et al., 2021) algorithm, which enables language generation models to produce fluent text while satisfying complex lexical constraints. Our algorithm is powerful and scalable. It tracks lexico-syntactic constraints (e.g., we need to observe dog as subject and ball as object) during decoding by parsing the partial generations at each step. To this end, we adapt a dependency parser to generate parses for incomplete sentences. Our approach is evaluated on three different language generation tasks, and the results show improved performance in both lexical and syntactic metrics compared to previous methods.
The results suggest this is a promising solution for integrating fine-grained controllable generation into the conventional beam search decoding.", + "authors": [ + "Mohaddeseh Bastan", + "Mihai Surdeanu", + "Niranjan Balasubramanian" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.528", + "point2d": [ + -30.303083419799805, + 14.180887222290039 + ], + "cluster": 4.0 + }, + { + "idx": 530, + "title": "The Best of Both Worlds: Combining Human and Machine Translations for Multilingual Semantic Parsing with Active Learning", + "abstract": "Multilingual semantic parsing aims to leverage the knowledge from the high-resource languages to improve low-resource semantic parsing, yet commonly suffers from the data imbalance problem. Prior works propose to utilize the translations by either humans or machines to alleviate such issues. However, human translations are expensive, while machine translations are cheap but prone to error and bias. In this work, we propose an active learning approach that exploits the strengths of both human and machine translations by iteratively adding small batches of human translations into the machine-translated training set. Besides, we propose novel aggregated acquisition criteria that help our active learning method select utterances to be manually translated. Our experiments demonstrate that an ideal utterance selection can significantly reduce the error and bias in the translated data, resulting in higher parser accuracies than the parsers merely trained on the machine-translated data.", + "authors": [ + "Zhuang Li", + "Lizhen Qu", + "Philip Cohen", + "Raj Tumuluri", + "Gholamreza Haffari" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.529", + "point2d": [ + -33.18894958496094, + -55.735992431640625 + ], + "cluster": 46.0 + }, + { + "idx": 531, + "title": "Ideology Prediction from Scarce and Biased Supervision: Learn to Disregard the \u201cWhat\u201d and Focus on the \u201cHow\u201d!", + "abstract": "We propose a novel supervised learning approach for political ideology prediction (PIP) that is capable of predicting out-of-distribution inputs. This problem is motivated by the fact that manual data-labeling is expensive, while self-reported labels are often scarce and exhibit significant selection bias. We propose a novel statistical model that decomposes the document embeddings into a linear superposition of two vectors; a latent neutral context vector independent of ideology, and a latent position vector aligned with ideology. We train an end-to-end model that has intermediate contextual and positional vectors as outputs. At deployment time, our model predicts labels for input documents by exclusively leveraging the predicted positional vectors. On two benchmark datasets we show that our model is capable of outputting predictions even when trained with as little as 5% biased data, and is significantly more accurate than the state-of-the-art. 
Through crowd-sourcing we validate the neutrality of contextual vectors, and show that context filtering results in ideological concentration, allowing for prediction on out-of-distribution examples.",
    "authors": [
      "Chen Chen",
      "Dylan Walker",
      "Venkatesh Saligrama"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.530",
    "point2d": [
      26.66671371459961,
      27.81573486328125
    ],
    "cluster": 19.0
  },
  {
    "idx": 532,
    "title": "Unsupervised Extractive Summarization of Emotion Triggers",
    "abstract": "Understanding what leads to emotions during large-scale crises is important as it can provide groundings for expressed emotions and subsequently improve the understanding of ongoing disasters. Recent approaches trained supervised models to both detect emotions and explain emotion triggers (events and appraisals) via abstractive summarization. However, obtaining timely and qualitative abstractive summaries is expensive and extremely time-consuming, requiring highly-trained expert annotators. In time-sensitive, high-stakes contexts, this can block necessary responses. We instead pursue unsupervised systems that extract triggers from text. First, we introduce CovidET-EXT, augmenting (Zhan et al., 2022)\u2019s abstractive dataset (in the context of the COVID-19 crisis) with extractive triggers. Second, we develop new unsupervised learning models that can jointly detect emotions and summarize their triggers. Our best approach, entitled Emotion-Aware Pagerank, incorporates emotion information from external sources combined with a language understanding module, and outperforms strong baselines. We release our data and code at https://github.com/tsosea2/CovidET-EXT.",
    "authors": [
      "Tiberiu Sosea",
      "Hongli Zhan",
      "Junyi Jessy Li",
      "Cornelia Caragea"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.531",
    "point2d": [
      40.90541076660156,
      32.328304290771484
    ],
    "cluster": 19.0
  },
  {
    "idx": 533,
    "title": "Document-Level Event Argument Extraction With a Chain Reasoning Paradigm",
    "abstract": "Document-level event argument extraction aims to identify event arguments beyond sentence level, where a significant challenge is to model long-range dependencies. Focusing on this challenge, we present a new chain reasoning paradigm for the task, which can generate decomposable first-order logic rules for reasoning. This paradigm naturally captures long-range interdependence due to the chains\u2019 compositional nature, which also improves interpretability by explicitly modeling the reasoning process. We introduce T-norm fuzzy logic for optimization, which permits end-to-end learning and shows promise for integrating the expressiveness of logical reasoning with the generalization of neural networks. In experiments, we show that our approach outperforms previous methods by a significant margin on two standard benchmarks (over 6 points in F1). Moreover, it is data-efficient in low-resource scenarios and robust enough to defend against adversarial attacks.",
    "authors": [
      "Jian Liu",
      "Chen Liang",
      "Jinan Xu",
      "Haoyan Liu",
      "Zhe Zhao"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.532",
    "point2d": [
      45.96669387817383,
      -46.67955017089844
    ],
    "cluster": 28.0
  },
  {
    "idx": 534,
    "title": "Pre-training Multi-party Dialogue Models with Latent Discourse Inference",
    "abstract": "Multi-party dialogues 
are more difficult for models to understand than one-to-one two-party dialogues, since they involve multiple interlocutors, resulting in interweaving reply-to relations and information flows. An effective way to overcome these obstacles is to pre-train a model that understands the discourse structure of multi-party dialogues, namely, to whom each utterance is replying. However, due to the lack of explicitly annotated discourse labels in multi-party dialogue corpora, previous works fail to scale up the pre-training process, leaving the unlabeled multi-party conversational data unused. To fully utilize the unlabeled data, we propose to treat the discourse structures as latent variables, then jointly infer them and pre-train the discourse-aware model by unsupervised latent variable inference methods. Experiments on multiple downstream tasks show that our pre-trained model outperforms strong baselines by large margins and achieves state-of-the-art (SOTA) results, justifying the effectiveness of our method. The official implementation of this paper is available at https://github.com/EricLee8/MPD_EMVI.",
    "authors": [
      "Yiyang Li",
      "Xinting Huang",
      "Wei Bi",
      "Hai Zhao"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.533",
    "point2d": [
      4.770171642303467,
      64.8992691040039
    ],
    "cluster": 49.0
  },
  {
    "idx": 535,
    "title": "Interpreting Positional Information in Perspective of Word Order",
    "abstract": "The attention mechanism is a powerful and effective method utilized in natural language processing. However, it has been observed that this method is insensitive to positional information. Although several studies have attempted to improve positional encoding and investigate the influence of word order perturbation, it remains unclear how positional encoding impacts NLP models from the perspective of word order. In this paper, we aim to shed light on this problem by analyzing the working mechanism of the attention module and investigating the root cause of its inability to encode positional information. Our hypothesis is that the insensitivity can be attributed to the weight sum operation utilized in the attention module. To verify this hypothesis, we propose a novel weight concatenation operation and evaluate its efficacy in neural machine translation tasks. Our enhanced experimental results not only reveal that the proposed operation can effectively encode positional information but also confirm our hypothesis.",
    "authors": [
      "Zhang Xilong",
      "Liu Ruochen",
      "Liu Jin",
      "Liang Xuefeng"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.534",
    "point2d": [
      -55.41045379638672,
      3.187244415283203
    ],
    "cluster": 27.0
  },
  {
    "idx": 536,
    "title": "I2D2: Inductive Knowledge Distillation with NeuroLogic and Self-Imitation",
    "abstract": "Commonsense capabilities of pre-trained language models dramatically improve with scale, leading many to believe that scale is the only winning recipe. But is it? Here, we investigate an alternative that a priori seems impossible: can smaller language models (e.g., GPT-2) win over models that are orders of magnitude larger and better (e.g., GPT-3), if powered with novel commonsense distillation algorithms? The key intellectual challenge is to design a learning algorithm that achieves a competitive level of commonsense acquisition, without relying on the benefits of scale.
In particular, we study generative models of commonsense knowledge, focusing on the task of generating generics, statements of commonsense facts about everyday concepts, e.g., birds can fly. We introduce I2D2, a novel commonsense distillation framework that loosely follows the Symbolic Knowledge Distillation of West et al. but breaks the dependence on the extreme-scale teacher model with two innovations: (1) the novel adaptation of NeuroLogic Decoding to enhance the generation quality of the weak, off-the-shelf language models, and (2) self-imitation learning to iteratively learn from the model\u2019s own enhanced commonsense acquisition capabilities. Empirical results suggest that scale is not the only way, as novel algorithms can be a promising alternative. Moreover, our study leads to a new corpus of generics, Gen-A-tomic, that is the largest and highest-quality available to date.",
    "authors": [
      "Chandra Bhagavatula",
      "Jena D. Hwang",
      "Doug Downey",
      "Ronan Le Bras",
      "Ximing Lu",
      "Lianhui Qin",
      "Keisuke Sakaguchi",
      "Swabha Swayamdipta",
      "Peter West",
      "Yejin Choi"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.535",
    "point2d": [
      38.034976959228516,
      -13.547789573669434
    ],
    "cluster": 39.0
  },
  {
    "idx": 537,
    "title": "More than Classification: A Unified Framework for Event Temporal Relation Extraction",
    "abstract": "Event temporal relation extraction (ETRE) is usually formulated as a multi-label classification task, where each type of relation is simply treated as a one-hot label. This formulation ignores the meaning of relations and wipes out their intrinsic dependency. After examining the relation definitions in various ETRE tasks, we observe that all relations can be interpreted using the start and end time points of events. For example, relation Includes could be interpreted as event 1 starting no later than event 2 and ending no earlier than event 2. In this paper, we propose a unified event temporal relation extraction framework, which transforms temporal relations into logical expressions of time points and completes the ETRE by predicting the relations between certain time point pairs. Experiments on TB-Dense and MATRES show significant improvements over a strong baseline, outperforming the state-of-the-art model by 0.3% on both datasets. By representing all relations in a unified framework, we can leverage the relations with sufficient data to assist the learning of other relations, thus achieving stable improvement in low-data scenarios. When the relation definitions are changed, our method can quickly adapt to the new ones by simply modifying the logic expressions that map time points to new event relations. The code is released at https://github.com/AndrewZhe/A-Unified-Framework-for-ETRE",
    "authors": [
      "Quzhe Huang",
      "Yutong Hu",
      "Shengqi Zhu",
      "Yansong Feng",
      "Chang Liu",
      "Dongyan Zhao"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.536",
    "point2d": [
      49.81592559814453,
      -50.89958572387695
    ],
    "cluster": 22.0
  },
  {
    "idx": 538,
    "title": "Multi-Source Test-Time Adaptation as Dueling Bandits for Extractive Question Answering",
    "abstract": "In this work, we study multi-source test-time model adaptation from user feedback, where K distinct models are established for adaptation.
To allow efficient adaptation, we cast the problem as a stochastic decision-making process, aiming to determine the best adapted model after adaptation. We discuss two frameworks: multi-armed bandit learning and multi-armed dueling bandits. Compared to multi-armed bandit learning, the dueling framework allows pairwise collaboration among K models, which we solve with Co-UCB, a novel method proposed in this work. Experiments on six datasets of extractive question answering (QA) show that the dueling framework using Co-UCB is more effective than other strong baselines for our studied problem.",
    "authors": [
      "Hai Ye",
      "Qizhe Xie",
      "Hwee Tou Ng"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.537",
    "point2d": [
      60.3740348815918,
      11.16822338104248
    ],
    "cluster": 5.0
  },
  {
    "idx": 539,
    "title": "Decoupling Pseudo Label Disambiguation and Representation Learning for Generalized Intent Discovery",
    "abstract": "Generalized intent discovery aims to extend a closed-set in-domain intent classifier to an open-world intent set including in-domain and out-of-domain intents. The key challenges lie in pseudo label disambiguation and representation learning. Previous methods suffer from a coupling of pseudo label disambiguation and representation learning, that is, the reliability of pseudo labels relies on representation learning, and representation learning is restricted by pseudo labels in turn. In this paper, we propose a decoupled prototype learning framework (DPL) to decouple pseudo label disambiguation and representation learning. Specifically, we first introduce prototypical contrastive representation learning (PCL) to get discriminative representations. We then adopt a prototype-based label disambiguation method (PLD) to obtain pseudo labels. We theoretically prove that PCL and PLD work in a collaborative fashion and facilitate pseudo label disambiguation. Experiments and analysis on three benchmark datasets show the effectiveness of our method.",
    "authors": [
      "Yutao Mou",
      "Xiaoshuai Song",
      "Keqing He",
      "Chen Zeng",
      "Pei Wang",
      "Jingang Wang",
      "Yunsen Xian",
      "Weiran Xu"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.538",
    "point2d": [
      -8.017022132873535,
      76.23906707763672
    ],
    "cluster": 32.0
  },
  {
    "idx": 540,
    "title": "DecompEval: Evaluating Generated Texts as Unsupervised Decomposed Question Answering",
    "abstract": "Existing evaluation metrics for natural language generation (NLG) tasks face challenges in generalization ability and interpretability. Specifically, most of the well-performing metrics need to be trained on evaluation datasets of specific NLG tasks and evaluation dimensions, which may cause over-fitting to task-specific datasets. Furthermore, existing metrics only provide an evaluation score for each dimension without revealing the evidence to interpret how this score is obtained. To deal with these challenges, we propose a simple yet effective metric called DecompEval. This metric formulates NLG evaluation as an instruction-style question answering task and utilizes instruction-tuned pre-trained language models (PLMs) without training on evaluation datasets, aiming to enhance the generalization ability.
To make the evaluation process more interpretable, we decompose our devised instruction-style question about the quality of generated texts into subquestions that measure the quality of each sentence. The subquestions with their answers generated by PLMs are then recomposed as evidence to obtain the evaluation result. Experimental results show that DecompEval achieves state-of-the-art performance in untrained metrics for evaluating text summarization and dialogue generation, and also exhibits strong dimension-level / task-level generalization ability and interpretability.",
    "authors": [
      "Pei Ke",
      "Fei Huang",
      "Fei Mi",
      "Yasheng Wang",
      "Qun Liu",
      "Xiaoyan Zhu",
      "Minlie Huang"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.539",
    "point2d": [
      55.55033493041992,
      16.719636917114258
    ],
    "cluster": 5.0
  },
  {
    "idx": 541,
    "title": "Backdooring Neural Code Search",
    "abstract": "Reusing off-the-shelf code snippets from online repositories is a common practice, which significantly enhances the productivity of software developers. To find desired code snippets, developers resort to code search engines through natural language queries. Neural code search models are hence behind many such engines. These models are based on deep learning and gain substantial attention due to their impressive performance. However, the security aspect of these models is rarely studied. Particularly, an adversary can inject a backdoor in neural code search models, which return buggy or even vulnerable code with security/privacy issues. This may impact the downstream software (e.g., stock trading systems and autonomous driving) and cause financial loss and/or life-threatening incidents. In this paper, we demonstrate such attacks are feasible and can be quite stealthy. By simply modifying one variable/function name, the attacker can make buggy/vulnerable code rank in the top 11%. Our attack BADCODE features a special trigger generation and injection procedure, making the attack more effective and stealthy. The evaluation is conducted on two neural code search models and the results show our attack outperforms baselines by 60%. Our user study demonstrates that our attack is twice as stealthy as the baseline based on the F1 score.",
    "authors": [
      "Weisong Sun",
      "Yuchen Chen",
      "Guanhong Tao",
      "Chunrong Fang",
      "Xiangyu Zhang",
      "Quanjun Zhang",
      "Bin Luo"
    ],
    "year": 2023,
    "source": "acl",
    "publication_type": "long",
    "doi": "10.18653/v1/2023.acl-long.540",
    "point2d": [
      -6.164587020874023,
      -59.35738754272461
    ],
    "cluster": 15.0
  },
  {
    "idx": 542,
    "title": "Concise Answers to Complex Questions: Summarization of Long-form Answers",
    "abstract": "Long-form question answering systems provide rich information by presenting paragraph-level answers, often containing optional background or auxiliary information. While such comprehensive answers are helpful, not all information is required to answer the question (e.g. users with domain knowledge do not need an explanation of background). Can we provide a concise version of the answer by summarizing it, while still addressing the question? We conduct a user study on summarized answers generated from state-of-the-art models and our newly proposed extract-and-decontextualize approach.
We find a large proportion of long-form answers (over 90%) in the ELI5 domain can be adequately summarized by at least one system, while complex and implicit answers are challenging to compress. We observe that decontextualization improves the quality of the extractive summary, exemplifying its potential in the summarization task. To promote future work, we provide an extractive summarization dataset covering 1K long-form answers and our user study annotations. Together, we present the first study on summarizing long-form answers, taking a step forward for QA agents that can provide answers at multiple granularities.", + "authors": [ + "Abhilash Potluri", + "Fangyuan Xu", + "Eunsol Choi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.541", + "point2d": [ + 67.20947265625, + 10.624093055725098 + ], + "cluster": 5.0 + }, + { + "idx": 543, + "title": "Towards Better Entity Linking with Multi-View Enhanced Distillation", + "abstract": "Dense retrieval is widely used for entity linking to retrieve entities from large-scale knowledge bases. Mainstream techniques are based on a dual-encoder framework, which encodes mentions and entities independently and calculates their relevances via rough interaction metrics, resulting in difficulty in explicitly modeling multiple mention-relevant parts within entities to match divergent mentions. Aiming at learning entity representations that can match divergent mentions, this paper proposes a Multi-View Enhanced Distillation (MVD) framework, which can effectively transfer knowledge of multiple fine-grained and mention-relevant parts within entities from cross-encoders to dual-encoders. Each entity is split into multiple views to avoid irrelevant information being over-squashed into the mention-relevant view. We further design cross-alignment and self-alignment mechanisms for this framework to facilitate fine-grained knowledge distillation from the teacher model to the student model. Meanwhile, we reserve a global-view that embeds the entity as a whole to prevent dispersal of uniform information. Experiments show our method achieves state-of-the-art performance on several entity linking benchmarks.", + "authors": [ + "Yi Liu", + "Yuan Tian", + "Jianxun Lian", + "Xinlong Wang", + "Yanan Cao", + "Fang Fang", + "Wen Zhang", + "Haizhen Huang", + "Weiwei Deng", + "Qi Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.542", + "point2d": [ + 41.86830520629883, + -65.59893035888672 + ], + "cluster": 25.0 + }, + { + "idx": 544, + "title": "A Measure-Theoretic Characterization of Tight Language Models", + "abstract": "Language modeling, a central task in natural language processing, involves estimating a probability distribution over strings. In most cases, the estimated distribution sums to 1 over all finite strings. However, in some pathological cases, probability mass can \u201cleak\u201d onto the set of infinite sequences. In order to characterize the notion of leakage more precisely, this paper offers a measure-theoretic treatment of language modeling. We prove that many popular language model families are in fact tight, meaning that they will not leak in this sense. 
We also generalize characterizations of tightness proposed in previous works.", + "authors": [ + "Li Du", + "Lucas Torroba Hennigen", + "Tiago Pimentel", + "Clara Meister", + "Jason Eisner", + "Ryan Cotterell" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.543", + "point2d": [ + -4.230434417724609, + 11.311310768127441 + ], + "cluster": 15.0 + }, + { + "idx": 545, + "title": "PAED: Zero-Shot Persona Attribute Extraction in Dialogues", + "abstract": "Persona attribute extraction is critical for personalized human-computer interaction. Dialogue is an important medium that communicates and delivers persona information. Although there is a public dataset for triplet-based persona attribute extraction from conversations, its automatically generated labels present many issues, including unspecific relations and inconsistent annotations. We fix such issues by leveraging more reliable text-label matching criteria to generate high-quality data for persona attribute extraction. We also propose a contrastive learning- and generation-based model with a novel hard negative sampling strategy for generalized zero-shot persona attribute extraction. We benchmark our model with state-of-the-art baselines on our dataset and a public dataset, showing outstanding accuracy gains. Our sampling strategy also exceeds others by a large margin in persona attribute extraction.", + "authors": [ + "Luyao Zhu", + "Wei Li", + "Rui Mao", + "Vlad Pandelea", + "Erik Cambria" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.544", + "point2d": [ + 13.180245399475098, + 73.125732421875 + ], + "cluster": 49.0 + }, + { + "idx": 546, + "title": "PromptRank: Unsupervised Keyphrase Extraction Using Prompt", + "abstract": "The keyphrase extraction task refers to the automatic selection of phrases from a given document to summarize its core content. State-of-the-art (SOTA) performance has recently been achieved by embedding-based algorithms, which rank candidates according to how similar their embeddings are to document embeddings. However, such solutions either struggle with the document and candidate length discrepancies or fail to fully utilize the pre-trained language model (PLM) without further fine-tuning. To this end, in this paper, we propose a simple yet effective unsupervised approach, PromptRank, based on the PLM with an encoder-decoder architecture. Specifically, PromptRank feeds the document into the encoder and calculates the probability of generating the candidate with a designed prompt by the decoder. We extensively evaluate the proposed PromptRank on six widely used benchmarks. PromptRank outperforms the SOTA approach MDERank, improving the F1 score relatively by 34.18%, 24.87%, and 17.57% for 5, 10, and 15 returned results, respectively. This demonstrates the great potential of using prompt for unsupervised keyphrase extraction. 
We release our code at https://github.com/HLT-NLP/PromptRank.", + "authors": [ + "Aobo Kong", + "Shiwan Zhao", + "Hao Chen", + "Qicheng Li", + "Yong Qin", + "Ruiqi Sun", + "Xiaoyan Bai" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.545", + "point2d": [ + -20.595211029052734, + 29.59922981262207 + ], + "cluster": 7.0 + }, + { + "idx": 547, + "title": "When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories", + "abstract": "Despite their impressive performance on diverse tasks, large language models (LMs) still struggle with tasks requiring rich world knowledge, implying the difficulty of encoding a wealth of world knowledge in their parameters. This paper aims to understand LMs\u2019 strengths and limitations in memorizing factual knowledge, by conducting large-scale knowledge probing experiments on two open-domain entity-centric QA datasets: PopQA, our new dataset with 14k questions about long-tail entities, and EntityQuestions, a widely used open-domain QA dataset. We find that LMs struggle with less popular factual knowledge, and that retrieval augmentation helps significantly in these cases. Scaling, on the other hand, mainly improves memorization of popular knowledge, and fails to appreciably improve memorization of factual knowledge in the tail. Based on those findings, we devise a new method for retrieval-augmentation that improves performance and reduces inference costs by only retrieving non-parametric memories when necessary.", + "authors": [ + "Alex Mallen", + "Akari Asai", + "Victor Zhong", + "Rajarshi Das", + "Daniel Khashabi", + "Hannaneh Hajishirzi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.546", + "point2d": [ + 45.46799087524414, + 1.4096671342849731 + ], + "cluster": 5.0 + }, + { + "idx": 548, + "title": "infoVerse: A Universal Framework for Dataset Characterization with Multidimensional Meta-information", + "abstract": "The success of NLP systems often relies on the availability of large, high-quality datasets. However, not all samples in these datasets are equally valuable for learning, as some may be redundant or noisy. Several methods for characterizing datasets based on model-driven meta-information (e.g., model\u2019s confidence) have been developed, but the relationship and complementary effects of these methods have received less attention. In this paper, we introduce infoVerse, a universal framework for dataset characterization, which provides a new feature space that effectively captures multidimensional characteristics of datasets by incorporating various model-driven meta-information. infoVerse reveals distinctive regions of the dataset that are not apparent in the original semantic space, hence guiding users (or models) in identifying which samples to focus on for exploration, assessment, or annotation. Additionally, we propose a novel sampling method on infoVerse to select a set of data points that maximizes informativeness. In three real-world applications (data pruning, active learning, and data annotation), the samples chosen on infoVerse space consistently outperform strong baselines in all applications. 
Our code and demo are publicly available.", + "authors": [ + "Jaehyung Kim", + "Yekyung Kim", + "Karin de Langis", + "Jinwoo Shin", + "Dongyeop Kang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.547", + "point2d": [ + -9.214640617370605, + -25.718196868896484 + ], + "cluster": 17.0 + }, + { + "idx": 549, + "title": "SeeGULL: A Stereotype Benchmark with Broad Geo-Cultural Coverage Leveraging Generative Models", + "abstract": "Stereotype benchmark datasets are crucial to detect and mitigate social stereotypes about groups of people in NLP models. However, existing datasets are limited in size and coverage, and are largely restricted to stereotypes prevalent in the Western society. This is especially problematic as language technologies gain hold across the globe. To address this gap, we present SeeGULL, a broad-coverage stereotype dataset, built by utilizing generative capabilities of large language models such as PaLM, and GPT-3, and leveraging a globally diverse rater pool to validate the prevalence of those stereotypes in society. SeeGULL is in English, and contains stereotypes about identity groups spanning 178 countries across 8 different geo-political regions across 6 continents, as well as state-level identities within the US and India. We also include fine-grained offensiveness scores for different stereotypes and demonstrate their global disparities. Furthermore, we include comparative annotations about the same groups by annotators living in the region vs. those that are based in North America, and demonstrate that within-region stereotypes about groups differ from those prevalent in North America.", + "authors": [ + "Akshita Jha", + "Aida Mostafazadeh Davani", + "Chandan K Reddy", + "Shachi Dave", + "Vinodkumar Prabhakaran", + "Sunipa Dev" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.548", + "point2d": [ + 24.39274024963379, + 32.619903564453125 + ], + "cluster": 10.0 + }, + { + "idx": 550, + "title": "Automated Metrics for Medical Multi-Document Summarization Disagree with Human Evaluations", + "abstract": "Evaluating multi-document summarization (MDS) quality is difficult. This is especially true in the case of MDS for biomedical literature reviews, where models must synthesize contradicting evidence reported across different documents. Prior work has shown that rather than performing the task, models may exploit shortcuts that are difficult to detect using standard n-gram similarity metrics such as ROUGE. Better automated evaluation metrics are needed, but few resources exist to assess metrics when they are proposed. Therefore, we introduce a dataset of human-assessed summary quality facets and pairwise preferences to encourage and support the development of better automated evaluation methods for literature review MDS. We take advantage of community submissions to the Multi-document Summarization for Literature Review (MSLR) shared task to compile a diverse and representative sample of generated summaries. We analyze how automated summarization evaluation metrics correlate with lexical features of generated summaries, to other automated metrics including several we propose in this work, and to aspects of human-assessed summary quality. 
We find that not only do automated metrics fail to capture aspects of quality as assessed by humans, in many cases the system rankings produced by these metrics are anti-correlated with rankings according to human annotators.", + "authors": [ + "Lucy Lu Wang", + "Yulia Otmakhova", + "Jay DeYoung", + "Thinh Hung Truong", + "Bailey Kuehl", + "Erin Bransom", + "Byron Wallace" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.549", + "point2d": [ + -6.372720241546631, + 33.78615951538086 + ], + "cluster": 47.0 + }, + { + "idx": 551, + "title": "Say What You Mean! Large Language Models Speak Too Positively about Negative Commonsense Knowledge", + "abstract": "Large language models (LLMs) have been widely studied for their ability to store and utilize positive knowledge. However, negative knowledge, such as \u201clions don\u2019t live in the ocean\u201d, is also ubiquitous in the world but rarely mentioned explicitly in text. What do LLMs know about negative knowledge? This work examines the ability of LLMs on negative commonsense knowledge. We design a constrained keywords-to-sentence generation task (CG) and a Boolean question answering task (QA) to probe LLMs. Our experiments reveal that LLMs frequently fail to generate valid sentences grounded in negative commonsense knowledge, yet they can correctly answer polar yes-or-no questions. We term this phenomenon the belief conflict of LLMs. Our further analysis shows that statistical shortcuts and negation reporting bias from language modeling pre-training cause this conflict.", + "authors": [ + "Jiangjie Chen", + "Wei Shi", + "Ziquan Fu", + "Sijie Cheng", + "Lei Li", + "Yanghua Xiao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.550", + "point2d": [ + 40.297821044921875, + -5.759456634521484 + ], + "cluster": 36.0 + }, + { + "idx": 552, + "title": "An Inner Table Retriever for Robust Table Question Answering", + "abstract": "Recent years have witnessed the thriving of pretrained Transformer-based language models for understanding semi-structured tables, with several applications, such as Table Question Answering (TableQA). These models are typically trained on joint tables and surrounding natural language text, by linearizing table content into sequences comprising special tokens and cell information. This yields very long sequences which increase system inefficiency, and moreover, simply truncating long sequences results in information loss for downstream tasks. We propose Inner Table Retriever (ITR), a general-purpose approach for handling long tables in TableQA that extracts sub-tables to preserve the most relevant information for a question. We show that ITR can be easily integrated into existing systems to improve their accuracy with up to 1.3-4.8% and achieve state-of-the-art results in two benchmarks, i.e., 63.4% in WikiTableQuestions and 92.1% in WikiSQL. Additionally, we show that ITR makes TableQA systems more robust to reduced model capacity and to different ordering of columns and rows. 
We make our code available at: https://github.com/amazon-science/robust-tableqa.", + "authors": [ + "Weizhe Lin", + "Rexhina Blloshmi", + "Bill Byrne", + "Adria de Gispert", + "Gonzalo Iglesias" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.551", + "point2d": [ + 77.38143157958984, + 6.563469886779785 + ], + "cluster": 5.0 + }, + { + "idx": 553, + "title": "SIMSUM: Document-level Text Simplification via Simultaneous Summarization", + "abstract": "Document-level text simplification is a specific type of simplification which involves simplifying documents consisting of several sentences by rewriting them into fewer or more sentences. In this paper, we propose a new two-stage framework SIMSUM for automated document-level text simplification. Our model is designed with explicit summarization and simplification models and guides the generation using the main keywords of a source text. In order to evaluate our new model, we use two existing benchmark datasets for simplification, namely D-Wikipedia and Wiki-Doc. We compare our model\u2019s performance with state of the art and show that SIMSUM achieves top results on the D-Wikipedia dataset SARI (+1.20), D-SARI (+1.64), and FKGL (-0.35) scores, improving over the best baseline models. In order to evaluate the quality of the generated text, we analyze the outputs from different models qualitatively and demonstrate the merit of our new model. Our code and datasets are available.", + "authors": [ + "Sofia Blinova", + "Xinyu Zhou", + "Martin Jaggi", + "Carsten Eickhoff", + "Seyed Ali Bahrainian" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.552", + "point2d": [ + -29.812957763671875, + 30.0782527923584 + ], + "cluster": 7.0 + }, + { + "idx": 554, + "title": "SimOAP: Improve Coherence and Consistency in Persona-based Dialogue Generation via Over-sampling and Post-evaluation", + "abstract": "Language models trained on large-scale corpora can generate remarkably fluent results in open-domain dialogue. However, for the persona-based dialogue generation task, consistency and coherence are also key factors, which are great challenges for language models. Existing works mainly focus on valuable data filtering, model structure modifying, or objective function designing, while their improvements are limited and hard to generalize to all types of pre-trained language models. However, we find that language models can produce consistent and coherent responses if we consider enough generations. Thus, the problems lie in large-scale response generation and target response selection. In this work, a simple but effective two-stage SimOAP strategy is proposed, i.e., over-sampling and post-evaluation. The over-sampling stage takes large-scale responses from existing trained models efficiently via off-the-shelf distilling and compressing methods, and the post-evaluation stage selects a good response based on multiple well-designed evaluation metrics from large-scale candidates. 
Experimental results show that the proposed plug-in SimOAP strategy improves the backbone models and outperforms the baseline strategies in both automatic and human evaluations.", + "authors": [ + "Junkai Zhou", + "Liang Pang", + "Huawei Shen", + "Xueqi Cheng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.553", + "point2d": [ + 13.146681785583496, + 64.27671813964844 + ], + "cluster": 49.0 + }, + { + "idx": 555, + "title": "NatLogAttack: A Framework for Attacking Natural Language Inference Models with Natural Logic", + "abstract": "Reasoning has been a central topic in artificial intelligence from the beginning. The recent progress made on distributed representation and neural networks continues to improve the state-of-the-art performance of natural language inference. However, it remains an open question whether the models perform real reasoning to reach their conclusions or rely on spurious correlations. Adversarial attacks have proven to be an important tool to help evaluate the Achilles\u2019 heel of the victim models. In this study, we explore the fundamental problem of developing attack models based on logic formalism. We propose NatLogAttack to perform systematic attacks centring around natural logic, a classical logic formalism that is traceable back to Aristotle\u2019s syllogism and has been closely developed for natural language inference. The proposed framework renders both label-preserving and label-flipping attacks.We show that compared to the existing attack models, NatLogAttack generates better adversarial examples with fewer visits to the victim models. The victim models are found to be more vulnerable under the label-flipping setting. NatLogAttack provides a tool to probe the existing and future NLI models\u2019 capacity from a key viewpoint and we hope more logic-based attacks will be further explored for understanding the desired property of reasoning.", + "authors": [ + "Zi\u2019ou Zheng", + "Xiaodan Zhu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.554", + "point2d": [ + 26.254058837890625, + -4.510188579559326 + ], + "cluster": 31.0 + }, + { + "idx": 556, + "title": "Cognitive Reframing of Negative Thoughts through Human-Language Model Interaction", + "abstract": "A proven therapeutic technique to overcome negative thoughts is to replace them with a more hopeful \u201creframed thought.\u201d Although therapy can help people practice and learn this Cognitive Reframing of Negative Thoughts, clinician shortages and mental health stigma commonly limit people\u2019s access to therapy. In this paper, we conduct a human-centered study of how language models may assist people in reframing negative thoughts. Based on psychology literature, we define a framework of seven linguistic attributes that can be used to reframe a thought. We develop automated metrics to measure these attributes and validate them with expert judgements from mental health practitioners. We collect a dataset of 600 situations, thoughts and reframes from practitioners and use it to train a retrieval-enhanced in-context learning model that effectively generates reframed thoughts and controls their linguistic attributes. To investigate what constitutes a \u201chigh-quality\u201d reframe, we conduct an IRB-approved randomized field study on a large mental health website with over 2,000 participants. 
Amongst other findings, we show that people prefer highly empathic or specific reframes, as opposed to reframes that are overly positive. Our findings provide key implications for the use of LMs to assist people in overcoming negative thoughts.", + "authors": [ + "Ashish Sharma", + "Kevin Rushton", + "Inna Lin", + "David Wadden", + "Khendra Lucas", + "Adam Miner", + "Theresa Nguyen", + "Tim Althoff" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.555", + "point2d": [ + 36.01646423339844, + 69.72364807128906 + ], + "cluster": 33.0 + }, + { + "idx": 557, + "title": "Dating Greek Papyri with Text Regression", + "abstract": "Dating Greek papyri accurately is crucial not only to edit their texts but also to understand numerous other aspects of ancient writing, document and book production and circulation, as well as various other aspects of administration, everyday life and intellectual history of antiquity. Although a substantial number of Greek papyri documents bear a date or other conclusive data as to their chronological placement, an even larger number can only be dated tentatively or in approximation, due to the lack of decisive evidence. By creating a dataset of 389 transcriptions of documentary Greek papyri, we train 389 regression models and we predict a date for the papyri with an average MAE of 54 years and an MSE of 1.17, outperforming image classifiers and other baselines. Last, we release date estimations for 159 manuscripts, for which only the upper limit is known.", + "authors": [ + "John Pavlopoulos", + "Maria Konstantinidou", + "Isabelle Marthot-Santaniello", + "Holger Essler", + "Asimina Paparigopoulou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.556", + "point2d": [ + -6.268496036529541, + 24.087282180786133 + ], + "cluster": 19.0 + }, + { + "idx": 558, + "title": "Interleaving Retrieval with Chain-of-Thought Reasoning for Knowledge-Intensive Multi-Step Questions", + "abstract": "Prompting-based large language models (LLMs) are surprisingly powerful at generating natural language reasoning steps or Chains-of-Thoughts (CoT) for multi-step question answering (QA). They struggle, however, when the necessary knowledge is either unavailable to the LLM or not up-to-date within its parameters. While using the question to retrieve relevant text from an external knowledge source helps LLMs, we observe that this one-step retrieve-and-read approach is insufficient for multi-step QA. Here, what to retrieve depends on what has already been derived, which in turn may depend on what was previously retrieved. To address this, we propose IRCoT, a new approach for multi-step QA that interleaves retrieval with steps (sentences) in a CoT, guiding the retrieval with CoT and in turn using retrieved results to improve CoT. Using IRCoT with GPT3 substantially improves retrieval (up to 21 points) as well as downstream QA (up to 15 points) on four datasets: HotpotQA, 2WikiMultihopQA, MuSiQue, and IIRC. We observe similar substantial gains in out-of-distribution (OOD) settings as well as with much smaller models such as Flan-T5-large without additional training. 
IRCoT reduces model hallucination, resulting in factually more accurate CoT reasoning.", + "authors": [ + "Harsh Trivedi", + "Niranjan Balasubramanian", + "Tushar Khot", + "Ashish Sabharwal" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.557", + "point2d": [ + 66.6802978515625, + 4.854610443115234 + ], + "cluster": 5.0 + }, + { + "idx": 559, + "title": "Direct Fact Retrieval from Knowledge Graphs without Entity Linking", + "abstract": "There has been a surge of interest in utilizing Knowledge Graphs (KGs) for various natural language processing/understanding tasks. The conventional mechanism to retrieve facts in KGs usually involves three steps: entity span detection, entity disambiguation, and relation classification. However, this approach requires additional labels for training each of the three subcomponents in addition to pairs of input texts and facts, and also may accumulate errors propagated from failures in previous steps. To tackle these limitations, we propose a simple knowledge retrieval framework, which directly retrieves facts from the KGs given the input text based on their representational similarities, which we refer to as Direct Fact Retrieval (DiFaR). Specifically, we first embed all facts in KGs onto a dense embedding space by using a language model trained by only pairs of input texts and facts, and then provide the nearest facts in response to the input text. Since the fact, consisting of only two entities and one relation, has little context to encode, we propose to further refine ranks of top-k retrieved facts with a reranker that contextualizes the input text and the fact jointly. We validate our DiFaR framework on multiple fact retrieval tasks, showing that it significantly outperforms relevant baselines that use the three-step approach.", + "authors": [ + "Jinheon Baek", + "Alham Fikri Aji", + "Jens Lehmann", + "Sung Ju Hwang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.558", + "point2d": [ + 52.39554977416992, + -62.037776947021484 + ], + "cluster": 45.0 + }, + { + "idx": 560, + "title": "DisentQA: Disentangling Parametric and Contextual Knowledge with Counterfactual Question Answering", + "abstract": "Question answering models commonly have access to two sources of \u201cknowledge\u201d during inference time: (1) parametric knowledge - the factual knowledge encoded in the model weights, and (2) contextual knowledge - external knowledge (e.g., a Wikipedia passage) given to the model to generate a grounded answer. Having these two sources of knowledge entangled together is a core issue for generative QA models as it is unclear whether the answer stems from the given non-parametric knowledge or not. This unclarity has implications on issues of trust, interpretability and factuality. In this work, we propose a new paradigm in which QA models are trained to disentangle the two sources of knowledge. Using counterfactual data augmentation, we introduce a model that predicts two answers for a given question: one based on given contextual knowledge and one based on parametric knowledge. 
Our experiments on the Natural Questions dataset show that this approach improves the performance of QA models by making them more robust to knowledge conflicts between the two knowledge sources, while generating useful disentangled answers.", + "authors": [ + "Ella Neeman", + "Roee Aharoni", + "Or Honovich", + "Leshem Choshen", + "Idan Szpektor", + "Omri Abend" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.559", + "point2d": [ + 63.31035232543945, + 3.317595958709717 + ], + "cluster": 5.0 + }, + { + "idx": 561, + "title": "A New Direction in Stance Detection: Target-Stance Extraction in the Wild", + "abstract": "Stance detection aims to detect the stance toward a corresponding target. Existing works use the assumption that the target is known in advance, which is often not the case in the wild. Given a text from social media platforms, the target information is often unknown due to implicit mentions in the source text and it is infeasible to have manual target annotations at a large scale. Therefore, in this paper, we propose a new task Target-Stance Extraction (TSE) that aims to extract the (target, stance) pair from the text. We benchmark the task by proposing a two-stage framework that first identifies the relevant target in the text and then detects the stance given the predicted target and text. Specifically, we first propose two different settings: Target Classification and Target Generation, to identify the potential target from a given text. Then we propose a multi-task approach that takes target prediction as the auxiliary task to detect the stance toward the predicted target. We evaluate the proposed framework on both in-target stance detection in which the test target is always seen in the training stage and zero-shot stance detection that needs to detect the stance for the targets that are unseen during the training phase. The new TSE task can facilitate future research in the field of stance detection.", + "authors": [ + "Yingjie Li", + "Krishna Garg", + "Cornelia Caragea" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.560", + "point2d": [ + 49.769161224365234, + 38.809810638427734 + ], + "cluster": 32.0 + }, + { + "idx": 562, + "title": "Improved Instruction Ordering in Recipe-Grounded Conversation", + "abstract": "In this paper, we study the task of instructional dialogue and focus on the cooking domain. Analyzing the generated output of the GPT-J model, we reveal that the primary challenge for a recipe-grounded dialog system is how to provide the instructions in the correct order. We hypothesize that this is due to the model\u2019s lack of understanding of user intent and inability to track the instruction state (i.e., which step was last instructed). Therefore, we propose to explore two auxiliary subtasks, namely User Intent Detection and Instruction State Tracking, to support Response Generation with improved instruction grounding. Experimenting with our newly collected dataset, ChattyChef, shows that incorporating user intent and instruction state information helps the response generation model mitigate the incorrect order issue. Furthermore, to investigate whether ChatGPT has completely solved this task, we analyze its outputs and find that it also makes mistakes (10.7% of the responses), about half of which are out-of-order instructions. 
We will release ChattyChef to facilitate further research in this area at: https://github.com/octaviaguo/ChattyChef.", + "authors": [ + "Duong Le", + "Ruohao Guo", + "Wei Xu", + "Alan Ritter" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.561", + "point2d": [ + 17.786027908325195, + 65.59845733642578 + ], + "cluster": 24.0 + }, + { + "idx": 563, + "title": "Token-wise Decomposition of Autoregressive Language Model Hidden States for Analyzing Model Predictions", + "abstract": "While there is much recent interest in studying why Transformer-based large language models make predictions the way they do, the complex computations performed within each layer have made their behavior somewhat opaque. To mitigate this opacity, this work presents a linear decomposition of final hidden states from autoregressive language models based on each initial input token, which is exact for virtually all contemporary Transformer architectures. This decomposition allows the definition of probability distributions that ablate the contribution of specific input tokens, which can be used to analyze their influence on model probabilities over a sequence of upcoming words with only one forward pass from the model. Using the change in next-word probability as a measure of importance, this work first examines which context words make the biggest contribution to language model predictions. Regression experiments suggest that Transformer-based language models rely primarily on collocational associations, followed by linguistic factors such as syntactic dependencies and coreference relationships in making next-word predictions. Additionally, analyses using these measures to predict syntactic dependencies and coreferent mention spans show that collocational association and repetitions of the same token largely explain the language models\u2019 predictions on these tasks.", + "authors": [ + "Byung-Doh Oh", + "William Schuler" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.562", + "point2d": [ + -46.27976608276367, + -32.268314361572266 + ], + "cluster": 27.0 + }, + { + "idx": 564, + "title": "Document-Level Multi-Event Extraction with Event Proxy Nodes and Hausdorff Distance Minimization", + "abstract": "Document-level multi-event extraction aims to extract the structural information from a given document automatically. Most recent approaches usually involve two steps: (1) modeling entity interactions; (2) decoding entity interactions into events. However, such approaches ignore a global view of inter-dependency of multiple events. Moreover, an event is decoded by iteratively merging its related entities as arguments, which might suffer from error propagation and is computationally inefficient. In this paper, we propose an alternative approach for document-level multi-event extraction with event proxy nodes and Hausdorff distance minimization. The event proxy nodes, representing pseudo-events, are able to build connections with other event proxy nodes, essentially capturing global information. The Hausdorff distance makes it possible to compare the similarity between the set of predicted events and the set of ground-truth events. By directly minimizing Hausdorff distance, the model is trained towards the global optimum directly, which improves performance and reduces training time. 
Experimental results show that our model outperforms the previous state-of-the-art method in F1-score on two datasets with only a fraction of training time.", + "authors": [ + "Xinyu Wang", + "Lin Gui", + "Yulan He" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.563", + "point2d": [ + 46.851226806640625, + -50.43425369262695 + ], + "cluster": 28.0 + }, + { + "idx": 565, + "title": "Dialog-Post: Multi-Level Self-Supervised Objectives and Hierarchical Model for Dialogue Post-Training", + "abstract": "Dialogue representation and understanding aim to convert conversational inputs into embeddings and fulfill discriminative tasks. Compared with free-form text, dialogue has two important characteristics, hierarchical semantic structure and multi-facet attributes. Therefore, directly applying the pretrained language models (PLMs) might result in unsatisfactory performance. Recently, several works focused on the dialogue-adaptive post-training (DialPost) that further trains PLMs to fit dialogues. To model dialogues more comprehensively, we propose a DialPost method, Dialog-Post, with multi-level self-supervised objectives and a hierarchical model. These objectives leverage dialogue-specific attributes and use self-supervised signals to fully facilitate the representation and understanding of dialogues. The novel model is a hierarchical segment-wise self-attention network, which contains inner-segment and inter-segment self-attention sub-layers followed by an aggregation and updating module. To evaluate the effectiveness of our methods, we first apply two public datasets for the verification of representation ability. Then we conduct experiments on a newly-labelled dataset that is annotated with 4 dialogue understanding tasks. Experimental results show that our method outperforms existing SOTA models and achieves a 3.3% improvement on average.", + "authors": [ + "Zhenyu Zhang", + "Lei Shen", + "Yuming Zhao", + "Meng Chen", + "Xiaodong He" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.564", + "point2d": [ + 4.678572654724121, + 66.7567367553711 + ], + "cluster": 49.0 + }, + { + "idx": 566, + "title": "Language Detoxification with Attribute-Discriminative Latent Space", + "abstract": "Transformer-based Language Models (LMs) have achieved impressive results on natural language understanding tasks, but they can also generate toxic text such as insults, threats, and profanity, limiting their real-world applications. To overcome this issue, a few text generation approaches aim to detoxify toxic texts using additional LMs or perturbations. However, previous methods require excessive memory, computations, and time which are serious bottlenecks in their real-world application. To address such limitations, we propose an effective yet efficient method for language detoxification using an attribute-discriminative latent space. Specifically, we project the latent space of an original Transformer LM onto a discriminative latent space that well-separates texts by their attributes using a projection block and an attribute discriminator. This allows the LM to control the text generation to be non-toxic with minimal memory and computation overhead. 
We validate our model, Attribute-Discriminative Language Model (ADLM) on detoxified language and dialogue generation tasks, on which our method significantly outperforms baselines both in performance and efficiency.", + "authors": [ + "Jin Myung Kwak", + "Minseon Kim", + "Sung Ju Hwang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.565", + "point2d": [ + 6.88231897354126, + 15.185500144958496 + ], + "cluster": 4.0 + }, + { + "idx": 567, + "title": "Just Like a Human Would, Direct Access to Sarcasm Augmented with Potential Result and Reaction", + "abstract": "Sarcasm, as a form of irony conveying mockery and contempt, has been widespread in social media such as Twitter and Weibo, where the sarcastic text is commonly characterized as an incongruity between the surface positive and negative situation. Naturally, it has an urgent demand to automatically identify sarcasm from social media, so as to illustrate people\u2019s real views toward specific targets. In this paper, we develop a novel sarcasm detection method, namely Sarcasm Detector with Augmentation of Potential Result and Reaction (SD-APRR). Inspired by the direct access view, we treat each sarcastic text as an incomplete version without latent content associated with implied negative situations, including the result and human reaction caused by its observable content. To fill the latent content, we estimate the potential result and human reaction for each given training sample by [xEffect] and [xReact] relations inferred by the pre-trained commonsense reasoning tool COMET, and integrate the sample with them as an augmented one. We can then employ those augmented samples to train the sarcasm detector, whose encoder is a graph neural network with a denoising module. We conduct extensive empirical experiments to evaluate the effectiveness of SD-APRR. The results demonstrate that SD-APRR can outperform strong baselines on benchmark datasets.", + "authors": [ + "Changrong Min", + "Ximing Li", + "Liang Yang", + "Zhilin Wang", + "Bo Xu", + "Hongfei Lin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.566", + "point2d": [ + -24.376081466674805, + 62.424251556396484 + ], + "cluster": 34.0 + }, + { + "idx": 568, + "title": "Adaptive and Personalized Exercise Generation for Online Language Learning", + "abstract": "Adaptive learning aims to provide customized educational activities (e.g., exercises) to address individual learning needs. However, manual construction and delivery of such activities is a laborious process. Thus, in this paper, we study a novel task of adaptive and personalized exercise generation for online language learning. To this end, we combine a knowledge tracing model that estimates each student\u2019s evolving knowledge states from their learning history and a controlled text generation model that generates exercise sentences based on the student\u2019s current estimated knowledge state and instructor requirements of desired properties (e.g., domain knowledge and difficulty). We train and evaluate our model on real-world learner interaction data from Duolingo and demonstrate that LMs guided by student states can generate superior exercises. Then, we discuss the potential use of our model in educational applications using various simulations. 
These simulations show that our model can adapt to students\u2019 individual abilities and can facilitate their learning efficiency by personalizing learning sequences.", + "authors": [ + "Peng Cui", + "Mrinmaya Sachan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.567", + "point2d": [ + 13.876944541931152, + 47.09143829345703 + ], + "cluster": 2.0 + }, + { + "idx": 569, + "title": "NLP Reproducibility For All: Understanding Experiences of Beginners", + "abstract": "As natural language processing (NLP) has recently seen an unprecedented level of excitement, and more people are eager to enter the field, it is unclear whether current research reproducibility efforts are sufficient for this group of beginners to apply the latest developments. To understand their needs, we conducted a study with 93 students in an introductory NLP course, where students reproduced the results of recent NLP papers. Surprisingly, we find that their programming skill and comprehension of research papers have a limited impact on their effort spent completing the exercise. Instead, we find accessibility efforts by research authors to be the key to success, including complete documentation, better coding practice, and easier access to data files. Going forward, we recommend that NLP researchers pay close attention to these simple aspects of open-sourcing their work, and use insights from beginners\u2019 feedback to provide actionable ideas on how to better support them.", + "authors": [ + "Shane Storks", + "Keunwoo Yu", + "Ziqiao Ma", + "Joyce Chai" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.568", + "point2d": [ + 20.354095458984375, + 16.223249435424805 + ], + "cluster": 40.0 + }, + { + "idx": 570, + "title": "Why Did the Chicken Cross the Road? Rephrasing and Analyzing Ambiguous Questions in VQA", + "abstract": "Natural language is ambiguous. Resolving ambiguous questions is key to successfully answering them. Focusing on questions about images, we create a dataset of ambiguous examples. We annotate these, grouping answers by the underlying question they address and rephrasing the question for each group to reduce ambiguity. Our analysis reveals a linguistically-aligned ontology of reasons for ambiguity in visual questions. We then develop an English question-generation model which we demonstrate via automatic and human evaluation produces less ambiguous questions. We further show that the question generation objective we use allows the model to integrate answer group information without any direct supervision.", + "authors": [ + "Elias Stengel-Eskin", + "Jimena Guallar-Blasco", + "Yi Zhou", + "Benjamin Van Durme" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.569", + "point2d": [ + 57.73307800292969, + 4.766653537750244 + ], + "cluster": 5.0 + }, + { + "idx": 571, + "title": "UMRSpell: Unifying the Detection and Correction Parts of Pre-trained Models towards Chinese Missing, Redundant, and Spelling Correction", + "abstract": "Chinese Spelling Correction (CSC) is the task of detecting and correcting misspelled characters in Chinese texts. 
As an important step for various downstream tasks, CSC confronts two challenges: 1) Character-level errors consist not only of spelling errors but also of missing and redundant ones that cause variable length between input and output texts, which most CSC methods could not handle well because of the consistent length of texts required by their inherent detection-correction framework. Consequently, the two errors are considered outside the scope and left to future work, despite the fact that they are widely found and bound to the CSC task in Chinese industrial scenarios, such as Automatic Speech Recognition (ASR) and Optical Character Recognition (OCR). 2) Most existing CSC methods focus on either detector or corrector and train different models for each one, respectively, leading to insufficiency of parameters sharing. To address these issues, we propose a novel model UMRSpell to learn detection and correction parts together at the same time from a multi-task learning perspective by using a detection transmission self-attention matrix, and flexibly deal with both missing, redundant, and spelling errors through re-tagging rules. Furthermore, we build a new dataset ECMR-2023 containing five kinds of character-level errors to enrich the CSC task closer to real-world applications. Experiments on both SIGHAN benchmarks and ECMR-2023 demonstrate the significant effectiveness of UMRSpell over previous representative baselines.", + "authors": [ + "Zheyu He", + "Yujin Zhu", + "Linlin Wang", + "Liang Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.570", + "point2d": [ + -39.66709518432617, + 8.677322387695312 + ], + "cluster": 30.0 + }, + { + "idx": 572, + "title": "LAIT: Efficient Multi-Segment Encoding in Transformers with Layer-Adjustable Interaction", + "abstract": "Transformer encoders contextualize token representations by attending to all other tokens at each layer, leading to quadratic increase in compute effort with the input length. In practice, however, the input text of many NLP tasks can be seen as a sequence of related segments (e.g., the sequence of sentences within a passage, or the hypothesis and premise in NLI). While attending across these segments is highly beneficial for many tasks, we hypothesize that this interaction can be delayed until later encoding stages. To this end, we introduce Layer-Adjustable Interactions in Transformers (LAIT). Within LAIT, segmented inputs are first encoded independently, and then jointly. This partial two-tower architecture bridges the gap between a Dual Encoder\u2019s ability to pre-compute representations for segments and a fully self-attentive Transformer\u2019s capacity to model cross-segment attention. The LAIT framework effectively leverages existing pretrained Transformers and converts them into the hybrid of the two aforementioned architectures, allowing for easy and intuitive control over the performance-efficiency tradeoff. 
Experimenting on a wide range of NLP tasks, we find LAIT able to reduce 30-50% of the attention FLOPs on many tasks, while preserving high accuracy; in some practical settings, LAIT could reduce actual latency by orders of magnitude.", + "authors": [ + "Jeremiah Milbauer", + "Annie Louis", + "Mohammad Javad Hosseini", + "Alex Fabrikant", + "Donald Metzler", + "Tal Schuster" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.571", + "point2d": [ + -36.45709991455078, + -29.788814544677734 + ], + "cluster": 27.0 + }, + { + "idx": 573, + "title": "Local Interpretation of Transformer Based on Linear Decomposition", + "abstract": "In recent years, deep neural networks (DNNs) have achieved state-of-the-art performance on a wide range of tasks. However, limitations in interpretability have hindered their applications in the real world. This work proposes to interpret neural networks by linear decomposition and finds that the ReLU-activated Transformer can be considered as a linear model on a single input. We further leverage the linearity of the model and propose a linear decomposition of the model output to generate local explanations. Our evaluation of sentiment classification and machine translation shows that our method achieves competitive performance in efficiency and fidelity of explanation. In addition, we demonstrate the potential of our approach in applications with examples of error analysis on multiple tasks.", + "authors": [ + "Sen Yang", + "Shujian Huang", + "Wei Zou", + "Jianbing Zhang", + "Xinyu Dai", + "Jiajun Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.572", + "point2d": [ + -43.581459045410156, + -29.989255905151367 + ], + "cluster": 6.0 + }, + { + "idx": 574, + "title": "DataFinder: Scientific Dataset Recommendation from Natural Language Descriptions", + "abstract": "Modern machine learning relies on datasets to develop and validate research ideas. Given the growth of publicly available data, finding the right dataset to use is increasingly difficult. Any research question imposes explicit and implicit constraints on how well a given dataset will enable researchers to answer this question, such as dataset size, modality, and domain. We operationalize the task of recommending datasets given a short natural language description of a research idea, to help people find relevant datasets for their needs. Dataset recommendation poses unique challenges as an information retrieval problem; datasets are hard to directly index for search and there are no corpora readily available for this task. To facilitate this task, we build the DataFinder Dataset which consists of a larger automatically-constructed training set (17.5K queries) and a smaller expert-annotated evaluation set (392 queries). Using this data, we compare various information retrieval algorithms on our test set and present a superior bi-encoder retriever for text-based dataset recommendation. This system, trained on the DataFinder Dataset, finds more relevant search results than existing third-party dataset search engines. 
To encourage progress on dataset recommendation, we release our dataset and models to the public.", + "authors": [ + "Vijay Viswanathan", + "Luyu Gao", + "Tongshuang Wu", + "Pengfei Liu", + "Graham Neubig" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.573", + "point2d": [ + 15.68000316619873, + 5.001099586486816 + ], + "cluster": 40.0 + }, + { + "idx": 575, + "title": "Multilingual Event Extraction from Historical Newspaper Adverts", + "abstract": "NLP methods can aid historians in analyzing textual materials in greater volumes than manually feasible. Developing such methods poses substantial challenges though. First, acquiring large, annotated historical datasets is difficult, as only domain experts can reliably label them. Second, most available off-the-shelf NLP models are trained on modern language texts, rendering them significantly less effective when applied to historical corpora. This is particularly problematic for less well studied tasks, and for languages other than English. This paper addresses these challenges while focusing on the under-explored task of event extraction from a novel domain of historical texts. We introduce a new multilingual dataset in English, French, and Dutch composed of newspaper ads from the early modern colonial period reporting on enslaved people who liberated themselves from enslavement. We find that: 1) even with scarce annotated data, it is possible to achieve surprisingly good results by formulating the problem as an extractive QA task and leveraging existing datasets and models for modern languages; and 2) cross-lingual low-resource learning for historical languages is highly challenging, and machine translation of the historical datasets to the considered target languages is, in practice, often the best-performing solution.", + "authors": [ + "Nadav Borenstein", + "Nat\u00e1lia da Silva Perez", + "Isabelle Augenstein" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.574", + "point2d": [ + 45.58889389038086, + -41.6641731262207 + ], + "cluster": 19.0 + }, + { + "idx": 576, + "title": "BIC: Twitter Bot Detection with Text-Graph Interaction and Semantic Consistency", + "abstract": "Twitter bots are automatic programs operated by malicious actors to manipulate public opinion and spread misinformation. Research efforts have been made to automatically identify bots based on texts and networks on social media. Existing methods only leverage texts or networks alone, and while few works explored the shallow combination of the two modalities, we hypothesize that the interaction and information exchange between texts and graphs could be crucial for holistically evaluating bot activities on social media. In addition, according to a recent survey (Cresci, 2020), Twitter bots are constantly evolving while advanced bots steal genuine users\u2019 tweets and dilute their malicious content to evade detection. This results in greater inconsistency across the timeline of novel Twitter bots, which warrants more attention. In light of these challenges, we propose BIC, a Twitter Bot detection framework with text-graph Interaction and semantic Consistency. Specifically, in addition to separately modeling the two modalities on social media, BIC employs a text-graph interaction module to enable information exchange across modalities in the learning process. 
In addition, given the stealing behavior of novel Twitter bots, BIC proposes to model semantic consistency in tweets based on attention weights while using it to augment the decision process. Extensive experiments demonstrate that BIC consistently outperforms state-of-the-art baselines on two widely adopted datasets. Further analyses reveal that text-graph interactions and modeling semantic consistency are essential improvements and help combat bot evolution.", + "authors": [ + "Zhenyu Lei", + "Herun Wan", + "Wenqian Zhang", + "Shangbin Feng", + "Zilong Chen", + "Jundong Li", + "Qinghua Zheng", + "Minnan Luo" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.575", + "point2d": [ + 32.5938720703125, + 21.94643783569336 + ], + "cluster": 15.0 + }, + { + "idx": 577, + "title": "Do I have the Knowledge to Answer? Investigating Answerability of Knowledge Base Questions", + "abstract": "When answering natural language questions over knowledge bases, missing facts, incomplete schema and limited scope naturally lead to many questions being unanswerable. While answerability has been explored in other QA settings, it has not been studied for QA over knowledge bases (KBQA). We create GrailQAbility, a new benchmark KBQA dataset with unanswerability, by first identifying various forms of KB incompleteness that make questions unanswerable, and then systematically adapting GrailQA (a popular KBQA dataset with only answerable questions). Experimenting with three state-of-the-art KBQA models, we find that all three models suffer a drop in performance even after suitable adaptation for unanswerable questions. In addition, these often detect unanswerability for wrong reasons and find specific forms of unanswerability particularly difficult to handle. This underscores the need for further research in making KBQA systems robust to unanswerability.", + "authors": [ + "Mayur Patidar", + "Prayushi Faldu", + "Avinash Singh", + "Lovekesh Vig", + "Indrajit Bhattacharya", + "Mausam -" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.576", + "point2d": [ + 64.43446350097656, + 1.122920036315918 + ], + "cluster": 5.0 + }, + { + "idx": 578, + "title": "Understanding Client Reactions in Online Mental Health Counseling", + "abstract": "Communication success relies heavily on reading participants\u2019 reactions. Such feedback is especially important for mental health counselors, who must carefully consider the client\u2019s progress and adjust their approach accordingly. However, previous NLP research on counseling has mainly focused on studying counselors\u2019 intervention strategies rather than their clients\u2019 reactions to the intervention. This work aims to fill this gap by developing a theoretically grounded annotation framework that encompasses counselors\u2019 strategies and client reaction behaviors. The framework has been tested against a large-scale, high-quality text-based counseling dataset we collected over the past two years from an online welfare counseling platform. Our study show how clients react to counselors\u2019 strategies, how such reactions affect the final counseling outcomes, and how counselors can adjust their strategies in response to these reactions. 
We also demonstrate that this study can help counselors automatically predict their clients\u2019 states.", + "authors": [ + "Anqi Li", + "Lizhi Ma", + "Yaling Mei", + "Hongliang He", + "Shuai Zhang", + "Huachuan Qiu", + "Zhenzhong Lan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.577", + "point2d": [ + 31.29441261291504, + 65.459716796875 + ], + "cluster": 33.0 + }, + { + "idx": 579, + "title": "Nonlinear Structural Equation Model Guided Gaussian Mixture Hierarchical Topic Modeling", + "abstract": "Hierarchical topic models, which can extract semantically meaningful topics from a text corpus in an unsupervised manner and automatically organise them into a topic hierarchy, have been widely used to discover the underlying semantic structure of documents. However, the existing models often assume in the prior that the topic hierarchy is a tree structure, ignoring symmetrical dependencies between topics at the same level. Moreover, the sparsity of text data often complicates the analysis. To address these issues, we propose NSEM-GMHTM as a deep topic model, with a Gaussian mixture prior distribution to improve the model\u2019s ability to adapt to sparse data, which explicitly models hierarchical and symmetric relations between topics through the dependency matrices and nonlinear structural equations. Experiments on widely used datasets show that our NSEM-GMHTM generates more coherent topics and a more rational topic structure when compared to state-of-the-art baselines. Our code is available at https://github.com/nbnbhwyy/NSEM-GMHTM.", + "authors": [ + "HeGang Chen", + "Pengbo Mao", + "Yuyin Lu", + "Yanghui Rao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.578", + "point2d": [ + -31.949430465698242, + 0.8493314385414124 + ], + "cluster": 17.0 + }, + { + "idx": 580, + "title": "Revisiting Token Dropping Strategy in Efficient BERT Pretraining", + "abstract": "Token dropping is a recently-proposed strategy to speed up the pretraining of masked language models, such as BERT, by skipping the computation of a subset of the input tokens at several middle layers. It can effectively reduce the training time without degrading much performance on downstream tasks. However, we empirically find that token dropping is prone to a semantic loss problem and falls short in handling semantic-intense tasks. Motivated by this, we propose a simple yet effective semantic-consistent learning method (ScTD) to improve the token dropping. ScTD aims to encourage the model to learn how to preserve the semantic information in the representation space. Extensive experiments on 12 tasks show that, with the help of our ScTD, token dropping can achieve consistent and significant performance gains across all task types and model sizes. 
More encouragingly, ScTD saves up to 57% of pretraining time and brings up to +1.56% average improvement over the vanilla token dropping.", + "authors": [ + "Qihuang Zhong", + "Liang Ding", + "Juhua Liu", + "Xuebo Liu", + "Min Zhang", + "Bo Du", + "Dacheng Tao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.579", + "point2d": [ + -30.776460647583008, + -25.033891677856445 + ], + "cluster": 8.0 + }, + { + "idx": 581, + "title": "The Benefits of Bad Advice: Autocontrastive Decoding across Model Layers", + "abstract": "Applying language models to natural language processing tasks typically relies on the representations in the final model layer, as intermediate hidden layer representations are presumed to be less informative. In this work, we argue that due to the gradual improvement across model layers, additional information can be gleaned from the contrast between higher and lower layers during inference. Specifically, in choosing between the probable next token predictions of a generative model, the predictions of lower layers can be used to highlight which candidates are best avoided. We propose a novel approach that utilizes the contrast between layers to improve text generation outputs, and show that it mitigates degenerative behaviors of the model in open-ended generation, significantly improving the quality of generated texts. Furthermore, our results indicate that contrasting between model layers at inference time can yield substantial benefits to certain aspects of general language model capabilities, more effectively extracting knowledge during inference from a given set of model parameters.", + "authors": [ + "Ariel Gera", + "Roni Friedman", + "Ofir Arviv", + "Chulaka Gunasekara", + "Benjamin Sznajder", + "Noam Slonim", + "Eyal Shnarch" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.580", + "point2d": [ + -25.722721099853516, + 13.84615421295166 + ], + "cluster": 4.0 + }, + { + "idx": 582, + "title": "FACTIFY-5WQA: 5W Aspect-based Fact Verification through Question Answering", + "abstract": "Automatic fact verification has received significant attention recently. Contemporary automatic fact-checking systems focus on estimating truthfulness using numerical scores which are not human-interpretable. A human fact-checker generally follows several logical steps to verify a verisimilitude claim and conclude whether it\u2019s truthful or a mere masquerade. Popular fact-checking websites follow a common structure for fact categorization such as half true, half false, false, pants on fire, etc. Therefore, it is necessary to have an aspect-based (delineating which part(s) are true and which are false) explainable system that can assist human fact-checkers in asking relevant questions related to a fact, which can then be validated separately to reach a final verdict. In this paper, we propose a 5W framework (who, what, when, where, and why) for question-answer-based fact explainability. To that end, we present a semi-automatically generated dataset called FACTIFY-5WQA, which consists of 391,041 facts along with relevant 5W QAs \u2013 underscoring our major contribution to this paper. A semantic role labeling system has been utilized to locate 5Ws, which generates QA pairs for claims using a masked language model. 
Finally, we report a baseline QA system to automatically locate those answers from evidence documents, which can serve as a baseline for future research in the field. Lastly, we propose a robust fact verification system that takes paraphrased claims and automatically validates them. The dataset and the baseline model are available at https://github.com/ankuranii/acl-5W-QA", + "authors": [ + "Anku Rani", + "S.M Towhidul Islam Tonmoy", + "Dwip Dalal", + "Shreya Gautam", + "Megha Chakraborty", + "Aman Chadha", + "Amit Sheth", + "Amitava Das" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.581", + "point2d": [ + 62.91768264770508, + -9.782918930053711 + ], + "cluster": 31.0 + }, + { + "idx": 583, + "title": "Naamapadam: A Large-Scale Named Entity Annotated Data for Indic Languages", + "abstract": "We present, Naamapadam, the largest publicly available Named Entity Recognition (NER) dataset for the 11 major Indian languages from two language families. The dataset contains more than 400k sentences annotated with a total of at least 100k entities from three standard entity categories (Person, Location, and, Organization) for 9 out of the 11 languages. The training dataset has been automatically created from the Samanantar parallel corpus by projecting automatically tagged entities from an English sentence to the corresponding Indian language translation. We also create manually annotated testsets for 9 languages. We demonstrate the utility of the obtained dataset on the Naamapadam-test dataset. We also release IndicNER, a multilingual IndicBERT model fine-tuned on Naamapadam training set. IndicNER achieves an F1 score of more than 80 for 7 out of 9 test languages. The dataset and models are available under open-source licences at https://ai4bharat.iitm.ac.in/naamapadam.", + "authors": [ + "Arnav Mhaske", + "Harshit Kedia", + "Sumanth Doddapaneni", + "Mitesh M. Khapra", + "Pratyush Kumar", + "Rudra Murthy", + "Anoop Kunchukuttan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.582", + "point2d": [ + 26.924644470214844, + -83.87064361572266 + ], + "cluster": 14.0 + }, + { + "idx": 584, + "title": "CREPE: Open-Domain Question Answering with False Presuppositions", + "abstract": "When asking about unfamiliar topics, information seeking users often pose questions with false presuppositions. Most existing question answering (QA) datasets, in contrast, assume all questions have well defined answers. We introduce CREPE, a QA dataset containing a natural distribution of presupposition failures from online information-seeking forums. We find that 25% of questions contain false presuppositions, and provide annotations for these presuppositions and their corrections. Through extensive baseline experiments, we show that adaptations of existing open-domain QA models can find presuppositions moderately well, but struggle when predicting whether a presupposition is factually correct. This is in large part due to difficulty in retrieving relevant evidence passages from a large text corpus. 
CREPE provides a benchmark to study question answering in the wild, and our analyses provide avenues for future work in better modeling and further studying the task.", + "authors": [ + "Xinyan Yu", + "Sewon Min", + "Luke Zettlemoyer", + "Hannaneh Hajishirzi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.583", + "point2d": [ + 63.54585647583008, + -0.4762803614139557 + ], + "cluster": 5.0 + }, + { + "idx": 585, + "title": "Joint Document-Level Event Extraction via Token-Token Bidirectional Event Completed Graph", + "abstract": "We solve the challenging document-level event extraction problem by proposing a joint extraction methodology that can avoid inefficiency and error propagation issues in classic pipeline methods. Essentially, we address the three crucial limitations in existing studies. First, the autoregressive strategy of path expansion heavily relies on the orders of argument role. Second, the number of events in documents must be specified in advance. Last, unexpected errors usually exist when decoding events based on the entity-entity adjacency matrix. To address these issues, this paper designs a Token-Token Bidirectional Event Completed Graph (TT-BECG) in which the relation eType-Role1-Role2 serves as the edge type, precisely revealing which tokens play argument roles in an event of a specific event type. Exploiting the token-token adjacency matrix of the TT-BECG, we develop an edge-enhanced joint document-level event extraction model. Guided by the target token-token adjacency matrix, the predicted token-token adjacency matrix can be obtained during the model training. Then, extracted events and event records in a document are decoded based on the predicted matrix, including the graph structure and edge type decoding. Extensive experiments are conducted on two public datasets, and the results confirm the effectiveness of our method and its superiority over the state-of-the-art baselines.", + "authors": [ + "Qizhi Wan", + "Changxuan Wan", + "Keli Xiao", + "Dexi Liu", + "Chenliang Li", + "Bolong Zheng", + "Xiping Liu", + "Rong Hu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.584", + "point2d": [ + 46.44937515258789, + -50.44858932495117 + ], + "cluster": 28.0 + }, + { + "idx": 586, + "title": "Robust Representation Learning with Reliable Pseudo-labels Generation via Self-Adaptive Optimal Transport for Short Text Clustering", + "abstract": "Short text clustering is challenging since it takes imbalanced and noisy data as inputs. Existing approaches cannot solve this problem well, since (1) they are prone to obtain degenerate solutions especially on heavily imbalanced datasets, and (2) they are vulnerable to noises. To tackle the above issues, we propose a Robust Short Text Clustering (RSTC) model to improve robustness against imbalanced and noisy data. RSTC includes two modules, i.e., pseudo-label generation module and robust representation learning module. The former generates pseudo-labels to provide supervision for the latter, which contributes to more robust representations and correctly separated clusters. To provide robustness against the imbalance in data, we propose self-adaptive optimal transport in the pseudo-label generation module. To improve robustness against the noise in data, we further introduce both class-wise and instance-wise contrastive learning in the robust representation learning module.
Our empirical studies on eight short text clustering datasets demonstrate that RSTC significantly outperforms the state-of-the-art models.", + "authors": [ + "Xiaolin Zheng", + "Mengling Hu", + "Weiming Liu", + "Chaochao Chen", + "Xinting Liao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.585", + "point2d": [ + -1.624906063079834, + -24.637821197509766 + ], + "cluster": 17.0 + }, + { + "idx": 587, + "title": "Multilingual Knowledge Graph Completion with Language-Sensitive Multi-Graph Attention", + "abstract": "Multilingual Knowledge Graph Completion (KGC) aims to predict missing links with multilingual knowledge graphs. However, existing approaches suffer from two main drawbacks: (a) alignment dependency: the multilingual KGC is always realized with joint entity or relation alignment, which introduces additional alignment models and increases the complexity of the whole framework; (b) training inefficiency: the trained model will only be used for the completion of one target KG, although the data from all KGs are used simultaneously. To address these drawbacks, we propose a novel multilingual KGC framework with language-sensitive multi-graph attention such that the missing links on all given KGs can be inferred by a universal knowledge completion model. Specifically, we first build a relational graph neural network by sharing the embeddings of aligned nodes to transfer language-independent knowledge. Meanwhile, a language-sensitive multi-graph attention (LSMGA) is proposed to deal with the information inconsistency among different KGs. Experimental results show that our model achieves significant improvements on the DBP-5L and E-PKG datasets.", + "authors": [ + "Rongchuan Tang", + "Yang Zhao", + "Chengqing Zong", + "Yu Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.586", + "point2d": [ + 53.733428955078125, + -65.55597686767578 + ], + "cluster": 45.0 + }, + { + "idx": 588, + "title": "What are the Desired Characteristics of Calibration Sets? Identifying Correlates on Long Form Scientific Summarization", + "abstract": "Summarization models often generate text that is poorly calibrated to quality metrics because they are trained to maximize the likelihood of a single reference (MLE). To address this, recent work has added a calibration step, which exposes a model to its own ranked outputs to improve relevance or, in a separate line of work, contrasts positive and negative sets to improve faithfulness. While effective, much of this work has focused on how to generate and optimize these sets. Less is known about why one setup is more effective than another. In this work, we uncover the underlying characteristics of effective sets. For each training instance, we form a large, diverse pool of candidates and systematically vary the subsets used for calibration fine-tuning. Each selection strategy targets distinct aspects of the sets, such as lexical diversity or the size of the gap between positive and negatives. 
On three diverse scientific long-form summarization datasets (spanning biomedical, clinical, and chemical domains), we find, among others, that faithfulness calibration is optimal when the negative sets are extractive and more likely to be generated, whereas for relevance calibration, the metric margin between candidates should be maximized and surprise\u2013the disagreement between model and metric defined candidate rankings\u2013minimized.", + "authors": [ + "Griffin Adams", + "Bichlien Nguyen", + "Jake Smith", + "Yingce Xia", + "Shufang Xie", + "Anna Ostropolets", + "Budhaditya Deb", + "Yuan-Jyue Chen", + "Tristan Naumann", + "No\u00e9mie Elhadad" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.587", + "point2d": [ + -6.873824596405029, + 34.17621612548828 + ], + "cluster": 47.0 + }, + { + "idx": 589, + "title": "Annotating Mentions Alone Enables Efficient Domain Adaptation for Coreference Resolution", + "abstract": "Although recent neural models for coreference resolution have led to substantial improvements on benchmark datasets, it remains a challenge to successfully transfer these models to new target domains containing many out-of-vocabulary spans and requiring differing annotation schemes. Typical approaches involve continued training on annotated target-domain data, but obtaining annotations is costly and time-consuming. In this work, we show that adapting mention detection is the key component to successful domain adaptation of coreference models, rather than antecedent linking. We also show annotating mentions alone is nearly twice as fast as annotating full coreference chains. Based on these insights, we propose a method for efficiently adapting coreference models, which includes a high-precision mention detection objective and requires only mention annotations in the target domain. Extensive evaluation across three English coreference datasets: CoNLL-2012 (news/conversation), i2b2/VA (medical notes), and child welfare notes, reveals that our approach facilitates annotation-efficient transfer and results in a 7-14% improvement in average F1 without increasing annotator time.", + "authors": [ + "Nupoor Gandhi", + "Anjalie Field", + "Emma Strubell" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.588", + "point2d": [ + 40.40717697143555, + -77.16950225830078 + ], + "cluster": 14.0 + }, + { + "idx": 590, + "title": "A Universal Discriminator for Zero-Shot Generalization", + "abstract": "Generative modeling has been the dominant approach for large-scale pretraining and zero-shot generalization. In this work, we challenge this convention by showing that discriminative approaches perform substantially better than generative ones on a large number of NLP tasks. Technically, we train a single discriminator to predict whether a text sample comes from the true data distribution, similar to GANs. Since many NLP tasks can be formulated as selecting from a few options, we use this discriminator to predict the concatenation of input and which option has the highest probability of coming from the true data distribution. This simple formulation achieves state-of-the-art zero-shot results on the T0 benchmark, outperforming T0 by 16.0%, 7.8%, and 11.5% respectively on different scales. In the finetuning setting, our approach also achieves new state-of-the-art results on a wide range of NLP tasks, with only 1/4 parameters of previous methods. 
Meanwhile, our approach requires minimal prompting efforts, which largely improves robustness and is essential for real-world applications. Furthermore, we also jointly train a generalized UD in combination with generative tasks, which maintains its advantage on discriminative tasks and simultaneously works on generative tasks.", + "authors": [ + "Haike Xu", + "Zongyu Lin", + "Jing Zhou", + "Yanan Zheng", + "Zhilin Yang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.589", + "point2d": [ + -18.560514450073242, + -1.5323071479797363 + ], + "cluster": 20.0 + }, + { + "idx": 591, + "title": "Syntax and Geometry of Information", + "abstract": "This paper presents an information-theoretical model of syntactic generalization. We study syntactic generalization from the perspective of the capacity to disentangle semantic and structural information, emulating the human capacity to assign a grammaticality judgment to semantically nonsensical sentences. In order to isolate the structure, we propose to represent the probability distribution behind a corpus as the product of the probability of a semantic context and the probability of a structure, the latter being independent of the former. We further elaborate the notion of abstraction as a relaxation of the property of independence. It is based on the measure of structural and contextual information for a given representation. We test abstraction as an optimization objective on the task of inducing syntactic categories from natural language data and show that it significantly outperforms alternative methods. Furthermore, we find that when syntax-unaware optimization objectives succeed in the task, their success is mainly due to an implicit disentanglement process rather than to the model structure. On the other hand, syntactic categories can be deduced in a principled way from the independence between structure and context.", + "authors": [ + "Rapha\u00ebl Bailly", + "Laurent Leblond", + "Kata G\u00e1bor" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.590", + "point2d": [ + 7.774505615234375, + -50.268497467041016 + ], + "cluster": 41.0 + }, + { + "idx": 592, + "title": "GreenKGC: A Lightweight Knowledge Graph Completion Method", + "abstract": "Knowledge graph completion (KGC) aims to discover missing relationships between entities in knowledge graphs (KGs). Most prior KGC work focuses on learning embeddings for entities and relations through a simple score function. Yet, a higher-dimensional embedding space is usually required for a better reasoning capability, which leads to larger model size and hinders applicability to real-world problems (e.g., large-scale KGs or mobile/edge computing). A lightweight modularized KGC solution, called GreenKGC, is proposed in this work to address this issue. GreenKGC consists of three modules: representation learning, feature pruning, and decision learning, to extract discriminant KG features and make accurate predictions on missing relationships using classifiers and negative sampling. Experimental results demonstrate that, in low dimensions, GreenKGC can outperform SOTA methods in most datasets. In addition, low-dimensional GreenKGC can achieve competitive or even better performance against high-dimensional models with a much smaller model size.", + "authors": [ + "Yun Cheng Wang", + "Xiou Ge", + "Bin Wang", + "C.-C. 
Jay Kuo" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.591", + "point2d": [ + 56.208152770996094, + -65.02565002441406 + ], + "cluster": 45.0 + }, + { + "idx": 593, + "title": "Unsupervised Open-domain Keyphrase Generation", + "abstract": "In this work, we study the problem of unsupervised open-domain keyphrase generation, where the objective is a keyphrase generation model that can be built without using human-labeled data and can perform consistently across domains. To solve this problem, we propose a seq2seq model that consists of two modules, namely phraseness and informativeness module, both of which can be built in an unsupervised and open-domain fashion. The phraseness module generates phrases, while the informativeness module guides the generation towards those that represent the core concepts of the text. We thoroughly evaluate our proposed method using eight benchmark datasets from different domains. Results on in-domain datasets show that our approach achieves state-of-the-art results compared with existing unsupervised models, and overall narrows the gap between supervised and unsupervised methods down to about 16%. Furthermore, we demonstrate that our model performs consistently across domains, as it surpasses the baselines on out-of-domain datasets.", + "authors": [ + "Lam Do", + "Pritom Saha Akash", + "Kevin Chen-Chuan Chang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.592", + "point2d": [ + -20.63445281982422, + 29.628969192504883 + ], + "cluster": 35.0 + }, + { + "idx": 594, + "title": "A Cognitive Stimulation Dialogue System with Multi-source Knowledge Fusion for Elders with Cognitive Impairment", + "abstract": "When communicating with elders with cognitive impairment, cognitive stimulation (CS) help to maintain the cognitive health of elders. Data sparsity is the main challenge in building CS-based dialogue systems, particularly in the Chinese language. To fill this gap, we construct a Chinese CS conversation (CSConv) dataset, which contains about 2.6K groups of dialogues with therapy principles and emotional support strategy labels. Making chit chat while providing emotional support is overlooked by the majority of existing cognitive dialogue systems. In this paper, we propose a multi-source knowledge fusion method for CS dialogue (CSD), to generate open-ended responses guided by the therapy principle and emotional support strategy. We first use a progressive mask method based on external knowledge to learn encoders as effective classifiers, which is the prerequisite to predict the therapy principle and emotional support strategy of the target response. Then a decoder interacts with the perceived therapy principle and emotional support strategy to generate responses. 
Extensive experiments conducted on the CSConv dataset demonstrate the effectiveness of the proposed method, while there is still a large space for improvement compared to human performance.", + "authors": [ + "Jiyue Jiang", + "Sheng Wang", + "Qintong Li", + "Lingpeng Kong", + "Chuan Wu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.593", + "point2d": [ + 27.233314514160156, + 68.80402374267578 + ], + "cluster": 33.0 + }, + { + "idx": 595, + "title": "Plug-and-Play Knowledge Injection for Pre-trained Language Models", + "abstract": "Injecting external knowledge can improve the performance of pre-trained language models (PLMs) on various downstream NLP tasks. However, massive retraining is required to deploy new knowledge injection methods or knowledge bases for downstream tasks. In this work, we are the first to study how to improve the flexibility and efficiency of knowledge injection by reusing existing downstream models. To this end, we explore a new paradigm plug-and-play knowledge injection, where knowledge bases are injected into frozen existing downstream models by a knowledge plugin. Correspondingly, we propose a plug-and-play injection method map-tuning, which trains a mapping of knowledge embeddings to enrich model inputs with mapped embeddings while keeping model parameters frozen. Experimental results on three knowledge-driven NLP tasks show that existing injection methods are not suitable for the new paradigm, while map-tuning effectively improves the performance of downstream models. Moreover, we show that a frozen downstream model can be well adapted to different domains with different mapping networks of domain knowledge. Our code and models are available at https://github.com/THUNLP/Knowledge-Plugin.", + "authors": [ + "Zhengyan Zhang", + "Zhiyuan Zeng", + "Yankai Lin", + "Huadong Wang", + "Deming Ye", + "Chaojun Xiao", + "Xu Han", + "Zhiyuan Liu", + "Peng Li", + "Maosong Sun", + "Jie Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.594", + "point2d": [ + -23.870346069335938, + -27.139522552490234 + ], + "cluster": 39.0 + }, + { + "idx": 596, + "title": "Two Birds One Stone: Dynamic Ensemble for OOD Intent Classification", + "abstract": "Out-of-domain (OOD) intent classification is an active field of natural language understanding, which is of great practical significance for intelligent devices such as the Task-Oriented Dialogue System. It mainly contains two challenges: it requires the model to know what it knows and what it does not know. This paper investigates \u201coverthinking\u201d in the open-world scenario and its impact on OOD intent classification. Inspired by this, we propose a two-birds-one-stone method, which allows the model to decide whether to make a decision on OOD classification early during inference and can ensure accuracy and accelerate inference. At the same time, to adapt to the behavior of dynamic inference, we also propose a training method based on ensemble methods. In addition to bringing certain theoretical insights, we also conduct detailed experiments on three real-world intent datasets.
Compared with the previous baselines, our method can not only improve inference speed, but also achieve significant performance improvements.", + "authors": [ + "Yunhua Zhou", + "Jianqiang Yang", + "Pengyu Wang", + "Xipeng Qiu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.595", + "point2d": [ + -6.873082160949707, + 73.2432632446289 + ], + "cluster": 32.0 + }, + { + "idx": 597, + "title": "SWiPE: A Dataset for Document-Level Simplification of Wikipedia Pages", + "abstract": "Text simplification research has mostly focused on sentence-level simplification, even though many desirable edits - such as adding relevant background information or reordering content - may require document-level context. Prior work has also predominantly framed simplification as a single-step, input-to-output task, only implicitly modeling the fine-grained, span-level edits that elucidate the simplification process. To address both gaps, we introduce the SWiPE dataset, which reconstructs the document-level editing process from English Wikipedia (EW) articles to paired Simple Wikipedia (SEW) articles. In contrast to prior work, SWiPE leverages the entire revision history when pairing pages in order to better identify simplification edits. We work with Wikipedia editors to annotate 5,000 EW-SEW document pairs, labeling more than 40,000 edits with 19 proposed categories. To scale our efforts, we propose several models to automatically label edits, achieving an F-1 score of up to 70.9, indicating that this is a tractable but challenging NLU task. Finally, we categorize the edits produced by several simplification models and find that SWiPE-trained models generate more complex edits while reducing unwanted edits.", + "authors": [ + "Philippe Laban", + "Jesse Vig", + "Wojciech Kryscinski", + "Shafiq Joty", + "Caiming Xiong", + "Chien-Sheng Wu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.596", + "point2d": [ + -29.228229522705078, + 30.496978759765625 + ], + "cluster": 47.0 + }, + { + "idx": 598, + "title": "Are Message Passing Neural Networks Really Helpful for Knowledge Graph Completion?", + "abstract": "Knowledge graphs (KGs) facilitate a wide variety of applications. Despite great efforts in creation and maintenance, even the largest KGs are far from complete. Hence, KG completion (KGC) has become one of the most crucial tasks for KG research. Recently, considerable literature in this space has centered around the use of Message Passing (Graph) Neural Networks (MPNNs), to learn powerful embeddings. The success of these methods is naturally attributed to the use of MPNNs over simpler multi-layer perceptron (MLP) models, given their additional message passing (MP) component. In this work, we find that surprisingly, simple MLP models are able to achieve comparable performance to MPNNs, suggesting that MP may not be as crucial as previously believed. With further exploration, we show careful scoring function and loss function design has a much stronger influence on KGC model performance.
This suggests a conflation of scoring function design, loss function design, and MP in prior work, with promising insights regarding the scalability of state-of-the-art KGC methods today, as well as careful attention to more suitable MP designs for KGC tasks tomorrow.", + "authors": [ + "Juanhui Li", + "Harry Shomer", + "Jiayuan Ding", + "Yiqi Wang", + "Yao Ma", + "Neil Shah", + "Jiliang Tang", + "Dawei Yin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.597", + "point2d": [ + 55.418216705322266, + -66.29796600341797 + ], + "cluster": 45.0 + }, + { + "idx": 599, + "title": "A dynamic programming algorithm for span-based nested named-entity recognition in O(n^2)", + "abstract": "Span-based nested named-entity recognition (NER) has a cubic-time complexity using a variant of the CYK algorithm. We show that by adding a supplementary structural constraint on the search space, nested NER has a quadratic-time complexity, that is the same asymptotic complexity as the non-nested case. The proposed algorithm covers a large part of three standard English benchmarks and delivers comparable experimental results.", + "authors": [ + "Caio Corro" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.598", + "point2d": [ + 29.649898529052734, + -79.60248565673828 + ], + "cluster": 14.0 + }, + { + "idx": 600, + "title": "Target-Side Augmentation for Document-Level Machine Translation", + "abstract": "Document-level machine translation faces the challenge of data sparsity due to its long input length and a small amount of training data, increasing the risk of learning spurious patterns. To address this challenge, we propose a target-side augmentation method, introducing a data augmentation (DA) model to generate many potential translations for each source document. Learning on these wider range translations, an MT model can learn a smoothed distribution, thereby reducing the risk of data sparsity. We demonstrate that the DA model, which estimates the posterior distribution, largely improves the MT performance, outperforming the previous best system by 2.30 s-BLEU on News and achieving new state-of-the-art on News and Europarl benchmarks.", + "authors": [ + "Guangsheng Bao", + "Zhiyang Teng", + "Yue Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.599", + "point2d": [ + -66.81795501708984, + -8.501652717590332 + ], + "cluster": 21.0 + }, + { + "idx": 601, + "title": "Rethinking Masked Language Modeling for Chinese Spelling Correction", + "abstract": "In this paper, we study Chinese Spelling Correction (CSC) as a joint decision made by two separate models: a language model and an error model. Through empirical analysis, we find that fine-tuning BERT tends to over-fit the error model while under-fitting the language model, resulting in poor generalization to out-of-distribution error patterns. Given that BERT is the backbone of most CSC models, this phenomenon has a significant negative impact. To address this issue, we are releasing a multi-domain benchmark LEMON, with higher quality and diversity than existing benchmarks, to allow a comprehensive assessment of the open domain generalization of CSC models. Then, we demonstrate that a very simple strategy \u2013 randomly masking 20% non-error tokens from the input sequence during fine-tuning \u2013 is sufficient for learning a much better language model without sacrificing the error model.
This technique can be applied to any model architecture and achieves new state-of-the-art results on SIGHAN, ECSpell, and LEMON.", + "authors": [ + "Hongqiu Wu", + "Shaohua Zhang", + "Yuchen Zhang", + "Hai Zhao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.600", + "point2d": [ + -39.84279251098633, + 8.182518005371094 + ], + "cluster": 30.0 + }, + { + "idx": 602, + "title": "A Multi-Modal Context Reasoning Approach for Conditional Inference on Joint Textual and Visual Clues", + "abstract": "Conditional inference on joint textual and visual clues is a multi-modal reasoning task that textual clues provide prior permutation or external knowledge, which are complementary with visual content and pivotal to deducing the correct option. Previous methods utilizing pretrained vision-language models (VLMs) have achieved impressive performances, yet they show a lack of multimodal context reasoning capability, especially for text-modal information. To address this issue, we propose a Multi-modal Context Reasoning approach, named ModCR. Compared to VLMs performing reasoning via cross modal semantic alignment, it regards the given textual abstract semantic and objective image information as the pre-context information and embeds them into the language model to perform context reasoning. Different from recent vision-aided language models used in natural language processing, ModCR incorporates the multi-view semantic alignment information between language and vision by introducing the learnable alignment prefix between image and text in the pretrained language model. This makes the language model well-suited for such a multi-modal reasoning scenario on joint textual and visual clues. We conduct extensive experiments on two corresponding data sets and experimental results show significantly improved performance (exact gain by 4.8% on PMR test set) compared to previous strong baselines.", + "authors": [ + "Yunxin Li", + "Baotian Hu", + "Chen Xinyu", + "Yuxin Ding", + "Lin Ma", + "Min Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.601", + "point2d": [ + -49.13843536376953, + 39.31124496459961 + ], + "cluster": 26.0 + }, + { + "idx": 603, + "title": "Simple and Effective Unsupervised Speech Translation", + "abstract": "The amount of labeled data to train models for speech tasks is limited for most languages, however, the data scarcity is exacerbated for speech translation which requires labeled data covering two different languages. To address this issue, we study a simple and effective approach to build speech translation systems without labeled data by leveraging recent advances in unsupervised speech recognition, machine translation and speech synthesis, either in a pipeline approach, or to generate pseudo-labels for training end-to-end speech translation models. Furthermore, we present an unsupervised domain adaptation technique for pre-trained speech models which improves the performance of downstream unsupervised speech recognition, especially for low-resource settings. Experiments show that unsupervised speech-to-text translation outperforms the previous unsupervised state of the art by 3.2 BLEU on the Libri-Trans benchmark; on CoVoST 2, our best systems outperform the best supervised end-to-end models (without pre-training) from only two years ago by an average of 5.0 BLEU over five X-En directions.
We also report competitive results on MuST-C and CVSS benchmarks.", + "authors": [ + "Changhan Wang", + "Hirofumi Inaguma", + "Peng-Jen Chen", + "Ilia Kulikov", + "Yun Tang", + "Wei-Ning Hsu", + "Michael Auli", + "Juan Pino" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.602", + "point2d": [ + -69.46270751953125, + 17.417993545532227 + ], + "cluster": 37.0 + }, + { + "idx": 604, + "title": "Modeling What-to-ask and How-to-ask for Answer-unaware Conversational Question Generation", + "abstract": "Conversational Question Generation (CQG) is a critical task for machines to assist humans in fulfilling their information needs through conversations. The task is generally cast into two different settings: answer-aware and answer-unaware. While the former facilitates the models by exposing the expected answer, the latter is more realistic and has been receiving growing attention recently. What-to-ask and how-to-ask are the two main challenges in the answer-unaware setting. To address the first challenge, existing methods mainly select sequential sentences in context as the rationales. We argue that the conversation generated using such naive heuristics may not be natural enough as in reality, the interlocutors often talk about the relevant contents that are not necessarily sequential in context. Additionally, previous methods decide the type of question to be generated (boolean/span-based) implicitly. Modeling the question type explicitly is crucial as the answer, which hints the models to generate a boolean or span-based question, is unavailable. To this end, we present SG-CQG, a two-stage CQG framework. For the what-to-ask stage, a sentence is selected as the rationale from a semantic graph that we construct, and the answer span is extracted from it. For the how-to-ask stage, a classifier determines the target answer type of the question via two explicit control signals before generating and filtering. In addition, we propose Conv-Distinct, a novel evaluation metric for CQG, to evaluate the diversity of the generated conversation from a context. Compared with the existing answer-unaware CQG models, the proposed SG-CQG achieves state-of-the-art performance.", + "authors": [ + "Xuan Long Do", + "Bowei Zou", + "Shafiq Joty", + "Tran Tai", + "Liangming Pan", + "Nancy Chen", + "Ai Ti Aw" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.603", + "point2d": [ + 19.19688606262207, + 52.98322677612305 + ], + "cluster": 5.0 + }, + { + "idx": 605, + "title": "CHEER: Centrality-aware High-order Event Reasoning Network for Document-level Event Causality Identification", + "abstract": "Document-level Event Causality Identification (DECI) aims to recognize causal relations between events within a document. Recent studies focus on building a document-level graph for cross-sentence reasoning, but ignore important causal structures \u2014 there are one or two \u201ccentral\u201d events that prevail throughout the document, with most other events serving as either their cause or consequence. In this paper, we manually annotate central events for a systematical investigation and propose a novel DECI model, CHEER, which performs high-order reasoning while considering event centrality. First, we summarize a general GNN-based DECI model and provide a unified view for better understanding.
Second, we design an Event Interaction Graph (EIG) involving the interactions among events (e.g., coreference) and event pairs, e.g., causal transitivity, cause(A, B) AND cause(B, C) \u2192 cause(A, C). Finally, we incorporate event centrality information into the EIG reasoning network via well-designed features and multi-task learning. We have conducted extensive experiments on two benchmark datasets. The results present great improvements (5.9% F1 gains on average) and demonstrate the effectiveness of each main component.", + "authors": [ + "Meiqi Chen", + "Yixin Cao", + "Yan Zhang", + "Zhiwei Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.604", + "point2d": [ + 48.82447814941406, + -47.7882194519043 + ], + "cluster": 28.0 + }, + { + "idx": 606, + "title": "f-Divergence Minimization for Sequence-Level Knowledge Distillation", + "abstract": "Knowledge distillation (KD) is the process of transferring knowledge from a large model to a small one. It has gained increasing attention in the natural language processing community, driven by the demands of compressing ever-growing language models. In this work, we propose an FDISTILL framework, which formulates sequence-level knowledge distillation as minimizing a generalized f-divergence function. We propose four distilling variants under our framework and show that existing SeqKD and ENGINE approaches are approximations of our FDISTILL methods. We further derive step-wise decomposition for our FDISTILL, reducing intractable sequence-level divergence to word-level losses that can be computed in a tractable manner. Experiments across four datasets show that our methods outperform existing KD approaches, and that our symmetric distilling losses can better force the student to learn from the teacher distribution.", + "authors": [ + "Yuqiao Wen", + "Zichao Li", + "Wenyu Du", + "Lili Mou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.605", + "point2d": [ + -48.89067459106445, + -21.035449981689453 + ], + "cluster": 39.0 + }, + { + "idx": 607, + "title": "Supervised Adversarial Contrastive Learning for Emotion Recognition in Conversations", + "abstract": "Extracting generalized and robust representations is a major challenge in emotion recognition in conversations (ERC). To address this, we propose a supervised adversarial contrastive learning (SACL) framework for learning class-spread structured representations in a supervised manner. SACL applies contrast-aware adversarial training to generate worst-case samples and uses joint class-spread contrastive learning to extract structured representations. It can effectively utilize label-level feature consistency and retain fine-grained intra-class features. To avoid the negative impact of adversarial perturbations on context-dependent data, we design a contextual adversarial training (CAT) strategy to learn more diverse features from context and enhance the model\u2019s context robustness. Under the framework with CAT, we develop a sequence-based SACL-LSTM to learn label-consistent and context-robust features for ERC. Experiments on three datasets show that SACL-LSTM achieves state-of-the-art performance on ERC.
Extended experiments prove the effectiveness of SACL and CAT.", + "authors": [ + "Dou Hu", + "Yinan Bao", + "Lingwei Wei", + "Wei Zhou", + "Songlin Hu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.606", + "point2d": [ + -38.5394287109375, + 64.87873077392578 + ], + "cluster": 23.0 + }, + { + "idx": 608, + "title": "A Novel Table-to-Graph Generation Approach for Document-Level Joint Entity and Relation Extraction", + "abstract": "Document-level relation extraction (DocRE) aims to extract relations among entities within a document, which is crucial for applications like knowledge graph construction. Existing methods usually assume that entities and their mentions are identified beforehand, which falls short of real-world applications. To overcome this limitation, we propose TaG, a novel table-to-graph generation model for joint extraction of entities and relations at document-level. To enhance the learning of task dependencies, TaG induces a latent graph among mentions, with different types of edges indicating different task information, which is further broadcast with a relational graph convolutional network. To alleviate the error propagation problem, we adapt the hierarchical agglomerative clustering algorithm to back-propagate task information at decoding stage. Experiments on the benchmark dataset, DocRED, demonstrate that TaG surpasses previous methods by a large margin and achieves state-of-the-art results.", + "authors": [ + "Ruoyu Zhang", + "Yanzeng Li", + "Lei Zou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.607", + "point2d": [ + 41.42705535888672, + -67.92008209228516 + ], + "cluster": 25.0 + }, + { + "idx": 609, + "title": "A Synthetic Data Generation Framework for Grounded Dialogues", + "abstract": "Training grounded response generation models often requires a large collection of grounded dialogues. However, it is costly to build such dialogues. In this paper, we present a synthetic data generation framework (SynDG) for grounded dialogues. The generation process utilizes large pre-trained language models and freely available knowledge data (e.g., Wikipedia pages, persona profiles, etc.). The key idea of designing SynDG is to consider dialogue flow and coherence in the generation process. Specifically, given knowledge data, we first heuristically determine a dialogue flow, which is a series of knowledge pieces. Then, we employ T5 to incrementally turn the dialogue flow into a dialogue. To ensure coherence of both the dialogue flow and the synthetic dialogue, we design a two-level filtering strategy, at the flow-level and the utterance-level respectively. Experiments on two public benchmarks show that the synthetic grounded dialogue data produced by our framework is able to significantly boost model performance in both full training data and low-resource scenarios.", + "authors": [ + "Jianzhu Bao", + "Rui Wang", + "Yasheng Wang", + "Aixin Sun", + "Yitong Li", + "Fei Mi", + "Ruifeng Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.608", + "point2d": [ + 13.706787109375, + 63.388423919677734 + ], + "cluster": 49.0 + }, + { + "idx": 610, + "title": "MasakhaPOS: Part-of-Speech Tagging for Typologically Diverse African Languages", + "abstract": "In this paper, we present MasakhaPOS, the largest part-of-speech (POS) dataset for 20 typologically diverse African languages.
We discuss the challenges in annotating POS for these languages using the universal dependencies (UD) guidelines. We conducted extensive POS baseline experiments using both conditional random fields and several multilingual pre-trained language models. We applied various cross-lingual transfer models trained with data available in the UD. Evaluating on the MasakhaPOS dataset, we show that choosing the best transfer language(s) in both single-source and multi-source setups greatly improves the POS tagging performance of the target languages, in particular when combined with parameter-fine-tuning methods. Crucially, transferring knowledge from a language that matches the language family and morphosyntactic properties seems to be more effective for POS tagging in unseen languages.", + "authors": [ + "Cheikh M. Bamba Dione", + "David Ifeoluwa Adelani", + "Peter Nabende", + "Jesujoba Alabi", + "Thapelo Sindane", + "Happy Buzaaba", + "Shamsuddeen Hassan Muhammad", + "Chris Chinenye Emezue", + "Perez Ogayo", + "Anuoluwapo Aremu", + "Catherine Gitau", + "Derguene Mbaye", + "Jonathan Mukiibi", + "Blessing Sibanda", + "Bonaventure F. P. Dossou", + "Andiswa Bukula", + "Rooweither Mabuya", + "Allahsera Auguste Tapo", + "Edwin Munkoh-Buabeng", + "Victoire Memdjokam Koagne", + "Fatoumata Ouoba Kabore", + "Amelia Taylor", + "Godson Kalipe", + "Tebogo Macucwa", + "Vukosi Marivate", + "Tajuddeen Gwadabe", + "Mboning Tchiaze Elvis", + "Ikechukwu Onyenwe", + "Gratien Atindogbe", + "Tolulope Adelani", + "Idris Akinade", + "Olanrewaju Samuel", + "Marien Nahimana", + "Th\u00e9og\u00e8ne Musabeyezu", + "Emile Niyomutabazi", + "Ester Chimhenga", + "Kudzai Gotosa", + "Patrick Mizha", + "Apelete Agbolo", + "Seydou Traore", + "Chinedu Uchechukwu", + "Aliyu Yusuf", + "Muhammad Abdullahi", + "Dietrich Klakow" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.609", + "point2d": [ + 11.996959686279297, + -30.40309715270996 + ], + "cluster": 46.0 + }, + { + "idx": 611, + "title": "Semantic Structure Enhanced Event Causality Identification", + "abstract": "Event Causality Identification (ECI) aims to identify causal relations between events in unstructured texts. This is a very challenging task, because causal relations are usually expressed by implicit associations between events. Existing methods usually capture such associations by directly modeling the texts with pre-trained language models, which underestimate two kinds of semantic structures vital to the ECI task, namely, event-centric structure and event-associated structure. The former includes important semantic elements related to the events to describe them more precisely, while the latter contains semantic paths between two events to provide possible supports for ECI.
In this paper, we study the implicit associations between events by modeling the above explicit semantic structures, and propose a Semantic Structure Integration model (SemSIn). It utilizes a GNN-based event aggregator to integrate the event-centric structure information, and employs an LSTM-based path aggregator to capture the event-associated structure information between two events. Experimental results on three widely used datasets show that SemSIn achieves significant improvements over baseline methods.", + "authors": [ + "Zhilei Hu", + "Zixuan Li", + "Xiaolong Jin", + "Long Bai", + "Saiping Guan", + "Jiafeng Guo", + "Xueqi Cheng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.610", + "point2d": [ + 49.54427719116211, + -47.09806823730469 + ], + "cluster": 28.0 + }, + { + "idx": 612, + "title": "Weakly-Supervised Spoken Video Grounding via Semantic Interaction Learning", + "abstract": "The task of spoken video grounding aims to localize moments in videos that are relevant to descriptive spoken queries. However, extracting semantic information from speech and modeling the cross-modal correlation pose two critical challenges. Previous studies solve them by representing spoken queries based on the matched video frames, which require tremendous effort for frame-level labeling. In this work, we investigate weakly-supervised spoken video grounding, i.e., learning to localize moments without expensive temporal annotations. To effectively represent the cross-modal semantics, we propose Semantic Interaction Learning (SIL), a novel framework consisting of the acoustic-semantic pre-training (ASP) and acoustic-visual contrastive learning (AVCL). In ASP, we pre-train an effective encoder for the grounding task with three comprehensive tasks, where the robustness task enhances stability by explicitly capturing the invariance between time- and frequency-domain features, the conciseness task avoids over-smooth attention by compressing long sequence into segments, and the semantic task improves spoken language understanding by modeling the precise semantics. In AVCL, we mine pseudo labels with discriminative sampling strategies and directly strengthen the interaction between speech and video by maximizing their mutual information. Extensive experiments demonstrate the effectiveness and superiority of our method.", + "authors": [ + "Ye Wang", + "Wang Lin", + "Shengyu Zhang", + "Tao Jin", + "Linjun Li", + "Xize Cheng", + "Zhou Zhao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.611", + "point2d": [ + -59.80802536010742, + 61.32835388183594 + ], + "cluster": 16.0 + }, + { + "idx": 613, + "title": "Rehearsal-free Continual Language Learning via Efficient Parameter Isolation", + "abstract": "We study the problem of defying catastrophic forgetting when learning a series of language processing tasks. Compared with previous methods, we emphasize the importance of not caching history tasks\u2019 data, which makes the problem more challenging. Our proposed method applies the parameter isolation strategy. For each task, it allocates a small portion of private parameters and learns them with a shared pre-trained model. To load correct parameters at testing time, we introduce a simple yet effective non-parametric method.
Experiments on continual language learning benchmarks show that our method is significantly better than all existing no-data-cache methods, and is comparable to (or even better than) those using historical data.", + "authors": [ + "Zhicheng Wang", + "Yufang Liu", + "Tao Ji", + "Xiaoling Wang", + "Yuanbin Wu", + "Congcong Jiang", + "Ye Chao", + "Zhencong Han", + "Ling Wang", + "Xu Shao", + "Wenqiu Zeng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.612", + "point2d": [ + -38.683345794677734, + -13.95685863494873 + ], + "cluster": 8.0 + }, + { + "idx": 614, + "title": "Label-Aware Hyperbolic Embeddings for Fine-grained Emotion Classification", + "abstract": "Fine-grained emotion classification (FEC) is a challenging task. Specifically, FEC needs to handle subtle nuance between labels, which can be complex and confusing. Most existing models only address the text classification problem in the Euclidean space, which we believe may not be the optimal solution as labels of close semantics (e.g., afraid and terrified) may not be differentiated in such space, which harms the performance. In this paper, we propose HypEmo, a novel framework that can integrate hyperbolic embeddings to improve the FEC task. First, we learn label embeddings in the hyperbolic space to better capture their hierarchical structure, and then our model projects contextualized representations to the hyperbolic space to compute the distance between samples and labels. Experimental results show that incorporating such distance to weight cross entropy loss substantially improves the performance on two benchmark datasets, with around 3% improvement compared to previous state-of-the-art, and could even improve up to 8.6% when the labels are hard to distinguish. Code is available at https://github.com/dinobby/HypEmo.", + "authors": [ + "Chih Yao Chen", + "Tun Min Hung", + "Yi-Li Hsu", + "Lun-Wei Ku" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.613", + "point2d": [ + -39.84645462036133, + 65.31504821777344 + ], + "cluster": 23.0 + }, + { + "idx": 615, + "title": "Combo of Thinking and Observing for Outside-Knowledge VQA", + "abstract": "Outside-knowledge visual question answering is a challenging task that requires both the acquisition and the use of open-ended real-world knowledge. Some existing solutions draw external knowledge into the cross-modality space which overlooks the much vaster textual knowledge in natural-language space, while others transform the image into a text which further fuses with the textual knowledge into the natural-language space and completely abandons the use of visual features. In this paper, we are inspired to constrain the cross-modality space into the same space of natural-language space which makes the visual features preserved directly, and the model still benefits from the vast knowledge in natural-language space. To this end, we propose a novel framework consisting of a multimodal encoder, a textual encoder and an answer decoder. Such structure allows us to introduce more types of knowledge including explicit and implicit multimodal and textual knowledge. Extensive experiments validate the superiority of the proposed method which outperforms the state-of-the-art by 6.17% accuracy. We also conduct comprehensive ablations of each component, and systematically study the roles of varying types of knowledge.
Codes and knowledge data are to be released.", + "authors": [ + "Qingyi Si", + "Yuchen Mo", + "Zheng Lin", + "Huishan Ji", + "Weiping Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.614", + "point2d": [ + 56.864810943603516, + 3.9648027420043945 + ], + "cluster": 5.0 + }, + { + "idx": 616, + "title": "AMPERE: AMR-Aware Prefix for Generation-Based Event Argument Extraction Model", + "abstract": "Event argument extraction (EAE) identifies event arguments and their specific roles for a given event. Recent advancement in generation-based EAE models has shown great performance and generalizability over classification-based models. However, existing generation-based EAE models mostly focus on problem re-formulation and prompt design, without incorporating additional information that has been shown to be effective for classification-based models, such as the abstract meaning representation (AMR) of the input passages. Incorporating such information into generation-based models is challenging due to the heterogeneous nature of the natural language form prevalently used in generation-based models and the structured form of AMRs. In this work, we study strategies to incorporate AMR into generation-based EAE models. We propose AMPERE, which generates AMR-aware prefixes for every layer of the generation model. Thus, the prefix introduces AMR information to the generation-based EAE model and then improves the generation. We also introduce an adjusted copy mechanism to AMPERE to help overcome potential noises brought by the AMR graph. Comprehensive experiments and analyses on ACE2005 and ERE datasets show that AMPERE can get 4% - 10% absolute F1 score improvements with reduced training data and it is in general powerful across different training sizes.", + "authors": [ + "I-Hung Hsu", + "Zhiyu Xie", + "Kuan-Hao Huang", + "Prem Natarajan", + "Nanyun Peng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.615", + "point2d": [ + 42.510887145996094, + -44.34791946411133 + ], + "cluster": 28.0 + }, + { + "idx": 617, + "title": "Your spouse needs professional help: Determining the Contextual Appropriateness of Messages through Modeling Social Relationships", + "abstract": "Understanding interpersonal communication requires, in part, understanding the social context and norms in which a message is said. However, current methods for identifying offensive content in such communication largely operate independent of context, with only a few approaches considering community norms or prior conversation as context. Here, we introduce a new approach to identifying inappropriate communication by explicitly modeling the social relationship between the individuals. We introduce a new dataset of contextually-situated judgments of appropriateness and show that large language models can readily incorporate relationship information to accurately identify appropriateness in a given context. Using data from online conversations and movie dialogues, we provide insight into how the relationships themselves function as implicit norms and quantify the degree to which context-sensitivity is needed in different conversation settings. 
Further, we also demonstrate that contextual-appropriateness judgments are predictive of other social factors expressed in language such as condescension and politeness.", + "authors": [ + "David Jurgens", + "Agrima Seth", + "Jackson Sargent", + "Athena Aghighi", + "Michael Geraci" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.616", + "point2d": [ + 20.98362159729004, + 39.08684158325195 + ], + "cluster": 33.0 + }, + { + "idx": 618, + "title": "TART: Improved Few-shot Text Classification Using Task-Adaptive Reference Transformation", + "abstract": "Meta-learning has emerged as a trending technique to tackle few-shot text classification and achieve state-of-the-art performance. However, the performance of existing approaches heavily depends on the inter-class variance of the support set. As a result, it can perform well on tasks when the semantics of sampled classes are distinct while failing to differentiate classes with similar semantics. In this paper, we propose a novel Task-Adaptive Reference Transformation (TART) network, aiming to enhance the generalization by transforming the class prototypes to per-class fixed reference points in task-adaptive metric spaces. To further maximize divergence between transformed prototypes in task-adaptive metric spaces, TART introduces a discriminative reference regularization among transformed prototypes. Extensive experiments are conducted on four benchmark datasets and our method demonstrates clear superiority over the state-of-the-art models in all the datasets. In particular, our model surpasses the state-of-the-art method by 7.4% and 5.4% in 1-shot and 5-shot classification on the 20 Newsgroups dataset, respectively.", + "authors": [ + "Shuo Lei", + "Xuchao Zhang", + "Jianfeng He", + "Fanglan Chen", + "Chang-Tien Lu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.617", + "point2d": [ + 1.4417048692703247, + -27.41680908203125 + ], + "cluster": 17.0 + }, + { + "idx": 619, + "title": "How Do In-Context Examples Affect Compositional Generalization?", + "abstract": "Compositional generalization\u2013understanding unseen combinations of seen primitives\u2013is an essential reasoning capability in human intelligence. The AI community mainly studies this capability by fine-tuning neural networks on lots of training samples, while it is still unclear whether and how in-context learning\u2013the prevailing few-shot paradigm based on large language models\u2013exhibits compositional generalization. In this paper, we present CoFe, a test suite to investigate in-context compositional generalization. We find that the compositional generalization performance can be easily affected by the selection of in-context examples, thus raising the research question of what the key factors are to make good in-context examples for compositional generalization. We study three potential factors: similarity, diversity and complexity.
Our systematic experiments indicate that in-context examples should be structurally similar to the test case, diverse from each other, and individually simple. Furthermore, two strong limitations are observed: in-context compositional generalization on fictional words is much weaker than that on commonly used ones; it is still critical that the in-context examples should cover required linguistic structures, even though the backbone model has been pre-trained on a large corpus. We hope our analysis would facilitate the understanding and utilization of the in-context learning paradigm.", + "authors": [ + "Shengnan An", + "Zeqi Lin", + "Qiang Fu", + "Bei Chen", + "Nanning Zheng", + "Jian-Guang Lou", + "Dongmei Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.618", + "point2d": [ + 37.781978607177734, + -14.776519775390625 + ], + "cluster": 36.0 + }, + { + "idx": 620, + "title": "Attractive Storyteller: Stylized Visual Storytelling with Unpaired Text", + "abstract": "Most research on stylized image captioning aims to generate style-specific captions using unpaired text, and has achieved impressive performance for simple styles like positive and negative. However, unlike previous single-sentence captions whose style is mostly embodied in distinctive words or phrases, real-world styles are likely to be implied at the syntactic and discourse levels. In this work, we introduce a new task of Stylized Visual Storytelling (SVST), which aims to describe a photo stream with stylized stories that are more expressive and attractive. We propose a multitasking memory-augmented framework called StyleVSG, which is jointly trained on factual visual storytelling data and unpaired style corpus, achieving a trade-off between style accuracy and visual relevance. Particularly for unpaired stylized text, StyleVSG learns to reconstruct the stylistic story from roughly parallel visual inputs mined with the CLIP model, avoiding problems caused by random mapping in previous methods. Furthermore, a memory module is designed to preserve the consistency and coherence of generated stories. Experiments show that our method can generate attractive and coherent stories with different styles such as fairy tale, romance, and humor. The overall performance of our StyleVSG surpasses state-of-the-art methods on both automatic and human evaluation metrics.", + "authors": [ + "Dingyi Yang", + "Qin Jin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.619", + "point2d": [ + -57.76424026489258, + 49.817501068115234 + ], + "cluster": 43.0 + }, + { + "idx": 621, + "title": "Multitask Pretraining with Structured Knowledge for Text-to-SQL Generation", + "abstract": "Many machine learning-based low-code or no-code applications involve generating code that interacts with structured knowledge. For example, one of the most studied tasks in this area is generating SQL code from a natural language statement. Prior work shows that incorporating context information from the database schema, such as table and column names, is beneficial to model performance on this task. In this work we present a large pretraining dataset and strategy for learning representations of text, tables, and SQL code that leverages the entire context of the problem. Specifically, we build on existing encoder-decoder architecture by introducing a multitask pretraining framework that complements the unique attributes of our diverse pretraining data.
Our work represents the first study on large-scale pretraining of encoder-decoder models for interacting with structured knowledge, and offers a new state-of-the-art foundation model in text-to-SQL generation. We validate our approach with experiments on two SQL tasks, showing improvement over existing methods, including a 1.7 and 2.2 percentage point improvement over the prior state of the art on Spider and CoSQL.", + "authors": [ + "Robert Giaquinto", + "Dejiao Zhang", + "Benjamin Kleiner", + "Yang Li", + "Ming Tan", + "Parminder Bhatia", + "Ramesh Nallapati", + "Xiaofei Ma" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.620", + "point2d": [ + -6.945837020874023, + -47.622283935546875 + ], + "cluster": 11.0 + }, + { + "idx": 622, + "title": "WSPAlign: Word Alignment Pre-training via Large-Scale Weakly Supervised Span Prediction", + "abstract": "Most existing word alignment methods rely on manual alignment datasets or parallel corpora, which limits their usefulness. Here, to mitigate the dependence on manual data, we broaden the source of supervision by relaxing the requirement for correct, fully-aligned, and parallel sentences. Specifically, we make noisy, partially aligned, and non-parallel paragraphs in this paper. We then use such a large-scale weakly-supervised dataset for word alignment pre-training via span prediction. Extensive experiments with various settings empirically demonstrate that our approach, which is named WSPAlign, is an effective and scalable way to pre-train word aligners without manual data. When fine-tuned on standard benchmarks, WSPAlign has set a new state of the art by improving upon the best supervised baseline by 3.3\u20136.1 points in F1 and 1.5\u20136.1 points in AER. Furthermore, WSPAlign also achieves competitive performance compared with the corresponding baselines in few-shot, zero-shot and cross-lingual tests, which demonstrates that WSPAlign is potentially more practical for low-resource languages than existing methods.", + "authors": [ + "Qiyu Wu", + "Masaaki Nagata", + "Yoshimasa Tsuruoka" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.621", + "point2d": [ + -57.52265548706055, + 5.034580230712891 + ], + "cluster": 1.0 + }, + { + "idx": 623, + "title": "Distill or Annotate? Cost-Efficient Fine-Tuning of Compact Models", + "abstract": "Fine-tuning large models is highly effective; however, inference can be expensive and produces carbon emissions. Knowledge distillation has been shown to be a practical solution to reduce inference costs, but the distillation process itself requires significant computational resources. Rather than buying or renting GPUs to fine-tune, then distill a large model, an NLP practitioner might instead choose to allocate the available budget to hire annotators and manually label additional fine-tuning data. In this paper, we investigate how to most efficiently use a fixed budget to build a compact model. Through extensive experiments on six diverse tasks, we show that distilling from T5-XXL (11B) to T5-Small (60M) is almost always a cost-efficient strategy compared to annotating more data to directly train a compact model (T5-Small). We further investigate how the optimal budget allocated towards computation varies across scenarios. 
We will make our code, datasets, annotation cost estimates, and baseline models available as a benchmark to support further work on cost-efficient training of compact models.", + "authors": [ + "Junmo Kang", + "Wei Xu", + "Alan Ritter" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.622", + "point2d": [ + -41.77814483642578, + -18.689889907836914 + ], + "cluster": 44.0 + }, + { + "idx": 624, + "title": "OD-RTE: A One-Stage Object Detection Framework for Relational Triple Extraction", + "abstract": "The Relational Triple Extraction (RTE) task is a fundamental and essential information extraction task. Recently, the table-filling RTE methods have received lots of attention. Despite their success, they suffer from some inherent problems such as underutilizing the regional information of triples. In this work, we treat the RTE task based on the table-filling method as an Object Detection task and propose a one-stage Object Detection framework for Relational Triple Extraction (OD-RTE). In this framework, vertices-based bounding box detection, coupled with auxiliary global relational triple region detection, ensures that the regional information of triples can be fully utilized. Besides, our proposed decoding scheme can extract all types of triples. In addition, the negative sampling strategy of relations in the training stage improves the training efficiency while alleviating the imbalance of positive and negative relations. The experimental results show that 1) OD-RTE achieves the state-of-the-art performance on two widely used datasets (i.e., NYT and WebNLG). 2) Compared with the best performing table-filling method, OD-RTE achieves faster training and inference speed with lower GPU memory usage. To facilitate future research in this area, the codes are publicly available at https://github.com/NingJinzhong/ODRTE.", + "authors": [ + "Jinzhong Ning", + "Zhihao Yang", + "Yuanyuan Sun", + "Zhizheng Wang", + "Hongfei Lin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.623", + "point2d": [ + 34.2198371887207, + -64.80393981933594 + ], + "cluster": 38.0 + }, + { + "idx": 625, + "title": "I Cast Detect Thoughts: Learning to Converse and Guide with Intents and Theory-of-Mind in Dungeons and Dragons", + "abstract": "We propose a novel task, G4C, to study teacher-student natural language interactions in a goal-driven and grounded environment. Dungeons and Dragons (D&D), a role-playing game, provides an ideal setting to investigate such interactions. Here, the Dungeon Master (DM), i.e., the teacher, guides the actions of several players\u2014students, each with their own personas and abilities\u2014to achieve shared goals grounded in a fantasy world. Our approach is to decompose and model these interactions into (1) the DM\u2019s intent to guide players toward a given goal; (2) the DM\u2019s guidance utterance to the players expressing this intent; and (3) a theory-of-mind (ToM) model that anticipates the players\u2019 reaction to the guidance one turn into the future. We develop a novel reinforcement learning (RL) method for training a DM that generates guidance for players by rewarding utterances where the intent matches the ToM-anticipated player actions. 
Human and automated evaluations show that a DM trained to explicitly model intents and incorporate ToM of the players using RL generates better-quality guidance that is 3x more likely to fulfill the DM\u2019s intent than a vanilla natural language generation (NLG) approach.", + "authors": [ + "Pei Zhou", + "Andrew Zhu", + "Jennifer Hu", + "Jay Pujara", + "Xiang Ren", + "Chris Callison-Burch", + "Yejin Choi", + "Prithviraj Ammanabrolu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.624", + "point2d": [ + 29.656091690063477, + 55.08116912841797 + ], + "cluster": 2.0 + }, + { + "idx": 626, + "title": "Multitask Pre-training of Modular Prompt for Chinese Few-Shot Learning", + "abstract": "Prompt tuning is a parameter-efficient approach to adapting pre-trained language models to downstream tasks. Although prompt tuning has been shown to match the performance of full model tuning when training data is sufficient, it tends to struggle in few-shot learning settings. In this paper, we present Multi-task Pre-trained Modular Prompt (MP2) to boost prompt tuning for few-shot learning. MP2 is a set of combinable prompts pre-trained on 38 Chinese tasks. On downstream tasks, the pre-trained prompts are selectively activated and combined, leading to strong compositional generalization to unseen tasks. To bridge the gap between pre-training and fine-tuning, we formulate upstream and downstream tasks into a unified machine reading comprehension task. Extensive experiments under two learning paradigms, i.e., gradient descent and black-box tuning, show that MP2 significantly outperforms prompt tuning, full model tuning, and prior prompt pre-training methods in few-shot settings. In addition, we demonstrate that MP2 can achieve surprisingly fast and strong adaptation to downstream tasks by merely learning 8 parameters to combine the pre-trained modular prompts.", + "authors": [ + "Tianxiang Sun", + "Zhengfu He", + "Qin Zhu", + "Xipeng Qiu", + "Xuanjing Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.625", + "point2d": [ + -20.63357925415039, + -11.65211296081543 + ], + "cluster": 3.0 + }, + { + "idx": 627, + "title": "Is GPT-3 a Good Data Annotator?", + "abstract": "Data annotation is the process of labeling data that could be used to train machine learning models. Having high quality annotation is crucial, as it allows the model to learn the relationship between the input data and the desired output. GPT-3, a large-scale language model developed by OpenAI, has demonstrated impressive zero- and few-shot performance on a wide range of NLP tasks. It is therefore natural to wonder whether it can be used to effectively annotate data for NLP tasks. In this paper, we evaluate the performance of GPT-3 as a data annotator by comparing it with traditional data annotation methods and analyzing its output on a range of tasks. 
Through this analysis, we aim to provide insight into the potential of GPT-3 as a general-purpose data annotator in NLP.", + "authors": [ + "Bosheng Ding", + "Chengwei Qin", + "Linlin Liu", + "Yew Ken Chia", + "Boyang Li", + "Shafiq Joty", + "Lidong Bing" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.626", + "point2d": [ + -1.483309268951416, + -15.899545669555664 + ], + "cluster": 46.0 + }, + { + "idx": 628, + "title": "Multi-Grained Knowledge Retrieval for End-to-End Task-Oriented Dialog", + "abstract": "Retrieving proper domain knowledge from an external database lies at the heart of end-to-end task-oriented dialog systems to generate informative responses. Most existing systems blend knowledge retrieval with response generation and optimize them with direct supervision from reference responses, leading to suboptimal retrieval performance when the knowledge base becomes large-scale. To address this, we propose to decouple knowledge retrieval from response generation and introduce a multi-grained knowledge retriever (MAKER) that includes an entity selector to search for relevant entities and an attribute selector to filter out irrelevant attributes. To train the retriever, we propose a novel distillation objective that derives supervision signals from the response generator. Experiments conducted on three standard benchmarks with both small and large-scale knowledge bases demonstrate that our retriever performs knowledge retrieval more effectively than existing methods. Our code has been made publicly available at https://github.com/18907305772/MAKER.", + "authors": [ + "Fanqi Wan", + "Weizhou Shen", + "Ke Yang", + "Xiaojun Quan", + "Wei Bi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.627", + "point2d": [ + 17.900510787963867, + 57.26727294921875 + ], + "cluster": 5.0 + }, + { + "idx": 629, + "title": "Few-shot Event Detection: An Empirical Study and a Unified View", + "abstract": "Few-shot event detection (ED) has been widely studied, yet this has brought noticeable discrepancies, e.g., various motivations, tasks, and experimental settings, that hinder the understanding of models for future progress. This paper presents a thorough empirical study, a unified view of ED models, and a better unified baseline. For fair evaluation, we compare 12 representative methods on three datasets, which are roughly grouped into prompt-based and prototype-based models for detailed analysis. Experiments consistently demonstrate that prompt-based methods, including ChatGPT, still significantly trail prototype-based methods in terms of overall performance. To investigate their superior performance, we break down their design elements along several dimensions and build a unified framework on prototype-based methods. Under such a unified view, each prototype-based method can be viewed as a combination of different modules from these design elements. 
We further combine all advantageous modules and propose a simple yet effective baseline, which outperforms existing methods by a large margin (e.g., 2.7% F1 gains under the low-resource setting).", + "authors": [ + "Yubo Ma", + "Zehao Wang", + "Yixin Cao", + "Aixin Sun" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.628", + "point2d": [ + 52.89950942993164, + -36.75796127319336 + ], + "cluster": 32.0 + }, + { + "idx": 630, + "title": "How to Plant Trees in Language Models: Data and Architectural Effects on the Emergence of Syntactic Inductive Biases", + "abstract": "Accurate syntactic representations are essential for robust generalization in natural language. Recent work has found that pre-training can teach language models to rely on hierarchical syntactic features\u2014as opposed to incorrect linear features\u2014when performing tasks after fine-tuning. We test what aspects of pre-training are important for endowing encoder-decoder Transformers with an inductive bias that favors hierarchical syntactic generalizations. We focus on architectural features (depth, width, and number of parameters), as well as the genre and size of the pre-training corpus, diagnosing inductive biases using two syntactic transformation tasks: question formation and passivization, both in English. We find that the number of parameters alone does not explain hierarchical generalization: model depth plays a greater role than model width. We also find that pre-training on simpler language, such as child-directed speech, induces a hierarchical bias using an order of magnitude less data than pre-training on more typical datasets based on web text or Wikipedia; this suggests that in cognitively plausible language acquisition settings, neural language models may be more data-efficient than previously thought.", + "authors": [ + "Aaron Mueller", + "Tal Linzen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.629", + "point2d": [ + -25.560230255126953, + -44.83565139770508 + ], + "cluster": 6.0 + }, + { + "idx": 631, + "title": "ClarifyDelphi: Reinforced Clarification Questions with Defeasibility Rewards for Social and Moral Situations", + "abstract": "Context is everything, even in commonsense moral reasoning. Changing contexts can flip the moral judgment of an action; lying to a friend is wrong in general, but may be morally acceptable if it is intended to protect their life. We present ClarifyDelphi, an interactive system that learns to ask clarification questions (e.g., why did you lie to your friend?) in order to elicit additional salient contexts of a social or moral situation. We posit that questions whose potential answers lead to diverging moral judgments are the most informative. Thus, we propose a reinforcement learning framework with a defeasibility reward that aims to maximize the divergence between moral judgments of hypothetical answers to a question. Human evaluation demonstrates that our system generates more relevant, informative and defeasible questions compared to competitive baselines. Our work is ultimately inspired by studies in cognitive science that have investigated the flexibility in moral cognition (i.e., the diverse contexts in which moral rules can be bent), and we hope that research in this direction can assist both cognitive and computational investigations of moral judgments.", + "authors": [ + "Valentina Pyatkin", + "Jena D. 
Hwang", + "Vivek Srikumar", + "Ximing Lu", + "Liwei Jiang", + "Yejin Choi", + "Chandra Bhagavatula" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.630", + "point2d": [ + 33.00053024291992, + 43.82396697998047 + ], + "cluster": 33.0 + }, + { + "idx": 632, + "title": "HINT: Hypernetwork Instruction Tuning for Efficient Zero- and Few-Shot Generalisation", + "abstract": "Recent NLP models have shown the remarkable ability to effectively generalise \u2018zero-shot\u2019 to new tasks using only natural language instructions as guidance. However, many of these approaches suffer from high computational costs due to their reliance on concatenating lengthy instructions with every input example, resulting in costly reprocessing of the instruction. To avoid this, we introduce Hypernetworks for INstruction Tuning (HINT), which convert task instructions and examples into parameter-efficient modules inserted into an underlying model using a pretrained text encoder, eliminating the need to include instructions in the model input. The hypernetwork in HINT also produces an encoded instruction, which we concatenate with encoded inputs during decoding to further improve performance. HINT models outperform strong state-of-the-art baselines by over 10% when controlling for compute (measured in FLOPs). By converting instructions into modules, HINT models can effectively disregard the length of instructions and few-shot example inputs in terms of compute usage. As a result, HINT can enhance its performance by up to 25% by incorporating additional few-shot data, while utilizing only up to 5% more compute. This combines the strengths of parameter-efficient fine-tuning and in-context learning.", + "authors": [ + "Hamish Ivison", + "Akshita Bhagia", + "Yizhong Wang", + "Hannaneh Hajishirzi", + "Matthew Peters" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.631", + "point2d": [ + -21.18181800842285, + -18.34870719909668 + ], + "cluster": 3.0 + }, + { + "idx": 633, + "title": "Measuring Inductive Biases of In-Context Learning with Underspecified Demonstrations", + "abstract": "In-context learning (ICL) is an important paradigm for adapting large language models (LLMs) to new tasks, but the generalization behavior of ICL remains poorly understood. We investigate the inductive biases of ICL from the perspective of feature bias: which feature ICL is more likely to use given a set of underspecified demonstrations in which two features are equally predictive of the labels. First, we characterize the feature biases of GPT-3 models by constructing underspecified demonstrations from a range of NLP datasets and feature combinations. We find that LLMs exhibit clear feature biases\u2014for example, demonstrating a strong bias to predict labels according to sentiment rather than shallow lexical features, like punctuation. Second, we evaluate the effect of different interventions that are designed to impose an inductive bias in favor of a particular feature, such as adding a natural language instruction or using semantically relevant label words. We find that, while many interventions can influence the learner to prefer a particular feature, it can be difficult to overcome strong prior biases. 
Overall, our results provide a broader picture of the types of features that ICL may be more likely to exploit and how to impose inductive biases that are better aligned with the intended task.", + "authors": [ + "Chenglei Si", + "Dan Friedman", + "Nitish Joshi", + "Shi Feng", + "Danqi Chen", + "He He" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.632", + "point2d": [ + -14.056731224060059, + -22.7192440032959 + ], + "cluster": 3.0 + }, + { + "idx": 634, + "title": "An Inclusive Notion of Text", + "abstract": "Natural language processing (NLP) researchers develop models of grammar, meaning and communication based on written text. Due to task and data differences, what is considered text can vary substantially across studies. A conceptual framework for systematically capturing these differences is lacking. We argue that clarity on the notion of text is crucial for reproducible and generalizable NLP. Towards that goal, we propose common terminology to discuss the production and transformation of textual data, and introduce a two-tier taxonomy of linguistic and non-linguistic elements that are available in textual sources and can be used in NLP modeling. We apply this taxonomy to survey existing work that extends the notion of text beyond the conservative language-centered view. We outline key desiderata and challenges of the emerging inclusive approach to text in NLP, and suggest community-level reporting as a crucial next step to consolidate the discussion.", + "authors": [ + "Ilia Kuznetsov", + "Iryna Gurevych" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.633", + "point2d": [ + 34.26819610595703, + 28.536516189575195 + ], + "cluster": 40.0 + }, + { + "idx": 635, + "title": "AlignScore: Evaluating Factual Consistency with A Unified Alignment Function", + "abstract": "Many text generation applications require the generated text to be factually consistent with input information. Automatic evaluation of factual consistency is challenging. Previous work has developed various metrics that often depend on specific functions, such as natural language inference (NLI) or question answering (QA), trained on limited data. Those metrics thus can hardly assess diverse factual inconsistencies (e.g., contradictions, hallucinations) that occur in varying inputs/outputs (e.g., sentences, documents) from different tasks. In this paper, we propose AlignScore, a new holistic metric that applies to a variety of factual inconsistency scenarios as above. AlignScore is based on a general function of information alignment between two arbitrary text pieces. Crucially, we develop a unified training framework of the alignment function by integrating a large diversity of data sources, resulting in 4.7M training examples from 7 well-established tasks (NLI, QA, paraphrasing, fact verification, information retrieval, semantic similarity, and summarization). We conduct extensive experiments on large-scale benchmarks including 22 evaluation datasets, where 19 of the datasets were never seen in the alignment training. AlignScore achieves substantial improvement over a wide range of previous metrics. 
Moreover, AlignScore (355M parameters) matches or even outperforms metrics based on ChatGPT and GPT-4 that are orders of magnitude larger.", + "authors": [ + "Yuheng Zha", + "Yichi Yang", + "Ruichen Li", + "Zhiting Hu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.634", + "point2d": [ + 26.370750427246094, + 6.198660373687744 + ], + "cluster": 47.0 + }, + { + "idx": 636, + "title": "Multi-source Semantic Graph-based Multimodal Sarcasm Explanation Generation", + "abstract": "Multimodal Sarcasm Explanation (MuSE) is a new yet challenging task, which aims to generate a natural language sentence for a multimodal social post (an image as well as its caption) to explain why it contains sarcasm. Although the existing pioneering study has achieved great success with the BART backbone, it overlooks the gap between the visual feature space and the decoder semantic space, the object-level metadata of the image, as well as the potential external knowledge. To solve these limitations, in this work, we propose a novel mulTi-source sEmantic grAph-based Multimodal sarcasm explanation scheme, named TEAM. In particular, TEAM extracts the object-level semantic meta-data instead of the traditional global visual features from the input image. Meanwhile, TEAM resorts to ConceptNet to obtain the external related knowledge concepts for the input text and the extracted object meta-data. Thereafter, TEAM introduces a multi-source semantic graph that comprehensively characterizes the multi-source (i.e., caption, object meta-data, external knowledge) semantic relations to facilitate sarcasm reasoning. Extensive experiments on the publicly released dataset MORE verify the superiority of our model over cutting-edge methods.", + "authors": [ + "Liqiang Jing", + "Xuemeng Song", + "Kun Ouyang", + "Mengzhao Jia", + "Liqiang Nie" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.635", + "point2d": [ + -25.31671142578125, + 62.27708435058594 + ], + "cluster": 34.0 + }, + { + "idx": 637, + "title": "Counterfactual Active Learning for Out-of-Distribution Generalization", + "abstract": "We study the out-of-distribution generalization of active learning that adaptively selects samples for annotation in learning the decision boundary of classification. Our empirical study finds that increasingly annotating seen samples may hardly benefit the generalization. To address the problem, we propose Counterfactual Active Learning (CounterAL) that empowers active learning with counterfactual thinking to bridge the seen samples with unseen cases. In addition to annotating factual samples, CounterAL requires annotators to answer counterfactual questions to construct counterfactual samples for training. To achieve CounterAL, we design a new acquisition strategy that selects the informative factual-counterfactual pairs for annotation; and a new training strategy that pushes the model update to focus on the discrepancy between factual and counterfactual samples. We evaluate CounterAL on multiple public datasets of sentiment analysis and natural language inference. 
The experimental results show that CounterAL requires fewer acquisition rounds and outperforms existing active learning methods by a large margin in OOD tests with comparable IID performance.", + "authors": [ + "Xun Deng", + "Wenjie Wang", + "Fuli Feng", + "Hanwang Zhang", + "Xiangnan He", + "Yong Liao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.636", + "point2d": [ + 1.325146198272705, + -0.9238684177398682 + ], + "cluster": 17.0 + }, + { + "idx": 638, + "title": "Multi-granularity Temporal Question Answering over Knowledge Graphs", + "abstract": "Recently, question answering over temporal knowledge graphs (i.e., TKGQA) has been introduced and investigated, in quest of reasoning about dynamic factual knowledge. To foster research on TKGQA, a few datasets have been curated (e.g., CronQuestions and Complex-CronQuestions), and various models have been proposed based on these datasets. Nevertheless, existing efforts overlook the fact that real-life applications of TKGQA also tend to be complex in temporal granularity, i.e., the questions may concern mixed temporal granularities (e.g., both day and month). To overcome the limitation, in this paper, we motivate the notion of multi-granularity temporal question answering over knowledge graphs and present a large-scale dataset for multi-granularity TKGQA, namely MultiTQ. To the best of our knowledge, MultiTQ is among the first of its kind, and compared with existing datasets on TKGQA, MultiTQ features at least two desirable aspects\u2014ample relevant facts and multiple temporal granularities. It is expected to better reflect real-world challenges, and serve as a test bed for TKGQA models. In addition, we propose a competing baseline MultiQA over MultiTQ, which is experimentally demonstrated to be effective in dealing with TKGQA. The data and code are released at https://github.com/czy1999/MultiTQ.", + "authors": [ + "Ziyang Chen", + "Jinzhi Liao", + "Xiang Zhao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.637", + "point2d": [ + 71.91753387451172, + -0.27425867319107056 + ], + "cluster": 45.0 + }, + { + "idx": 639, + "title": "A New Aligned Simple German Corpus", + "abstract": "\u201cLeichte Sprache\u201d, the German counterpart to Simple English, is a regulated language aiming to facilitate complex written language that would otherwise stay inaccessible to different groups of people. We present a new sentence-aligned monolingual corpus for Simple German \u2013 German. It contains multiple document-aligned sources which we have aligned using automatic sentence-alignment methods. We evaluate our alignments based on a manually labelled subset of aligned documents. The quality of our sentence alignments, as measured by the F1-score, surpasses previous work. We publish the dataset under CC BY-SA and the accompanying code under MIT license.", + "authors": [ + "Vanessa Toborek", + "Moritz Busch", + "Malte Bo\u00dfert", + "Christian Bauckhage", + "Pascal Welke" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.638", + "point2d": [ + -34.13274002075195, + 27.1616268157959 + ], + "cluster": 1.0 + }, + { + "idx": 640, + "title": "Introducing Semantics into Speech Encoders", + "abstract": "Recent studies find existing self-supervised speech encoders contain primarily acoustic rather than semantic information. 
As a result, pipelined supervised automatic speech recognition (ASR) to large language model (LLM) systems achieve state-of-the-art results on semantic spoken language tasks by utilizing rich semantic representations from the LLM. These systems come at the cost of labeled audio transcriptions, which are expensive and time-consuming to obtain. We propose a task-agnostic unsupervised way of incorporating semantic information from LLMs into self-supervised speech encoders without labeled audio transcriptions. By introducing semantics, we improve existing speech encoder spoken language understanding (SLU) performance by over 5% on intent classification (IC), with modest gains in named entity recognition (NER) and slot filling (SF), and spoken question answering (SQA) FF1 score by over 2%. Our approach, which uses no ASR data, achieves similar performance as methods trained on over 100 hours of labeled audio transcripts, demonstrating the feasibility of unsupervised semantic augmentations to existing speech encoders.", + "authors": [ + "Derek Xu", + "Shuyan Dong", + "Changhan Wang", + "Suyoun Kim", + "Zhaojiang Lin", + "Bing Liu", + "Akshat Shrivastava", + "Shang-Wen Li", + "Liang-Hsuan Tseng", + "Guan-Ting Lin", + "Alexei Baevski", + "Hung-yi Lee", + "Yizhou Sun", + "Wei Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.639", + "point2d": [ + -4.186102867126465, + 64.67200469970703 + ], + "cluster": 30.0 + }, + { + "idx": 641, + "title": "Constrained Tuple Extraction with Interaction-Aware Network", + "abstract": "Tuple extraction is a fundamental task for information extraction and knowledge graph construction. The extracted tuples are usually represented as knowledge triples consisting of subject, relation, and object. In practice, however, the validity of knowledge triples is associated with and changes with the spatial, temporal, or other kinds of constraints. Motivated by this observation, this paper proposes a constrained tuple extraction (CTE) task to guarantee the validity of knowledge tuples. Formally, the CTE task is to extract constrained tuples from unstructured text, which adds constraints to conventional triples. To this end, we propose an interaction-aware network. Combinatorial interactions among context-specific external features and distinct-granularity internal features are exploited to effectively mine the potential constraints. Moreover, we have built a new dataset containing a total of 1,748,826 constrained tuples for training and 3,656 for evaluation. Experiments on our dataset and the public CaRB dataset demonstrate the superiority of the proposed model. The constructed dataset and the codes are publicly available.", + "authors": [ + "Xiaojun Xue", + "Chunxia Zhang", + "Tianxiang Xu", + "Zhendong Niu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.640", + "point2d": [ + 34.30849838256836, + -64.77063751220703 + ], + "cluster": 38.0 + }, + { + "idx": 642, + "title": "MultiInstruct: Improving Multi-Modal Zero-Shot Learning via Instruction Tuning", + "abstract": "Instruction tuning, a new learning paradigm that fine-tunes pre-trained language models on tasks specified through instructions, has shown promising zero-shot performance on various natural language processing tasks. However, it has yet to be explored for vision and multimodal tasks. 
In this work, we introduce MultiInstruct, the first multimodal instruction tuning benchmark dataset that consists of 62 diverse multimodal tasks in a unified seq-to-seq format covering 10 broad categories. The tasks are derived from 21 existing open-source datasets and each task is equipped with 5 expert-written instructions. We take OFA as the base pre-trained model for multimodal instruction tuning, and to further improve its zero-shot performance, we explore multiple transfer learning strategies to leverage the large-scale Natural Instructions dataset. Experimental results demonstrate strong zero-shot performance on various unseen multimodal tasks and the benefit of transfer learning from a text-only instruction dataset. We also design a new evaluation metric \u2013 Sensitivity, to evaluate how sensitive the model is to the variety of instructions. Our results indicate that fine-tuning the model on a diverse set of tasks and instructions leads to a reduced sensitivity to variations in instructions for each task.", + "authors": [ + "Zhiyang Xu", + "Ying Shen", + "Lifu Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.641", + "point2d": [ + -20.7106876373291, + -16.04387855529785 + ], + "cluster": 3.0 + }, + { + "idx": 643, + "title": "Single Sequence Prediction over Reasoning Graphs for Multi-hop QA", + "abstract": "Recent generative approaches for multi-hop question answering (QA) utilize the fusion-in-decoder method to generate a single sequence output which includes both a final answer and a reasoning path taken to arrive at that answer, such as passage titles and key facts from those passages. While such models can lead to better interpretability and high quantitative scores, they often have difficulty accurately identifying the passages corresponding to key entities in the context, resulting in incorrect passage hops and a lack of faithfulness in the reasoning path. To address this, we propose a single-sequence prediction method over a local reasoning graph that integrates a graph structure connecting key entities in each context passage to relevant subsequent passages for each question. We use a graph neural network to encode this graph structure and fuse the resulting representations into the entity representations of the model. Our experiments show significant improvements in answer exact-match/F1 scores and faithfulness of grounding in the reasoning path on the HotpotQA dataset and achieve state-of-the-art numbers on the Musique dataset with only up to a 4% increase in model parameters.", + "authors": [ + "Gowtham Ramesh", + "Makesh Narsimhan Sreedhar", + "Junjie Hu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.642", + "point2d": [ + 70.94589233398438, + 5.291740417480469 + ], + "cluster": 5.0 + }, + { + "idx": 644, + "title": "Contrastive Error Attribution for Finetuned Language Models", + "abstract": "Recent work has identified noisy and misannotated data as a core cause of hallucinations and unfaithful outputs in Natural Language Generation (NLG) tasks. Consequently, identifying and removing these examples is a key open challenge in creating reliable NLG systems. In this work, we introduce a framework to identify and remove low-quality training instances that lead to undesirable outputs, such as faithfulness errors in text summarization. 
We show that existing approaches for error tracing, such as gradient-based influence measures, do not perform reliably for detecting faithfulness errors in NLG datasets. We overcome the drawbacks of existing error tracing methods through a new, contrast-based estimate that compares undesired generations to human-corrected outputs. Our proposed method can achieve a mean average precision of 0.93 at detecting known data errors across synthetic tasks with known ground truth, substantially outperforming existing approaches. Using this approach and re-training models on cleaned data leads to a 70% reduction in entity hallucinations on the NYT dataset and a 55% reduction in semantic errors on the E2E dataset.", + "authors": [ + "Faisal Ladhak", + "Esin Durmus", + "Tatsunori Hashimoto" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.643", + "point2d": [ + -21.232633590698242, + 17.050825119018555 + ], + "cluster": 4.0 + }, + { + "idx": 645, + "title": "DARE: Towards Robust Text Explanations in Biomedical and Healthcare Applications", + "abstract": "Along with the successful deployment of deep neural networks in several application domains, the need to unravel the black-box nature of these networks has seen a significant increase recently. Several methods have been introduced to provide insight into the inference process of deep neural networks. However, most of these explainability methods have been shown to be brittle in the face of adversarial perturbations of their inputs in the image and generic textual domain. In this work we show that this phenomenon extends to specific and important high stakes domains like biomedical datasets. In particular, we observe that the robustness of explanations should be characterized in terms of the accuracy of the explanation in linking a model\u2019s inputs and its decisions - faithfulness - and its relevance from the perspective of domain experts - plausibility. This is crucial to prevent explanations that are inaccurate but still look convincing in the context of the domain at hand. To this end, we show how to adapt current attribution robustness estimation methods to a given domain, so as to take into account domain-specific plausibility. This results in our Domain Adaptive AR Estimator (DARE), an attribution robustness estimator that allows us to properly characterize the domain-specific robustness of faithful explanations. Next, we provide two methods, adversarial training and FAR training, to mitigate the brittleness characterized by DARE, allowing us to train networks that display robust attributions. Finally, we empirically validate our methods with extensive experiments on three established biomedical benchmarks.", + "authors": [ + "Adam Ivankay", + "Mattia Rigotti", + "Pascal Frossard" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.644", + "point2d": [ + 22.32550621032715, + -4.469849586486816 + ], + "cluster": 42.0 + }, + { + "idx": 646, + "title": "Neural Machine Translation for Mathematical Formulae", + "abstract": "We tackle the problem of neural machine translation of mathematical formulae between ambiguous presentation languages and unambiguous content languages. Compared to neural machine translation on natural language, mathematical formulae have a much smaller vocabulary and much longer sequences of symbols, while their translation requires extreme precision to satisfy mathematical information needs. 
In this work, we perform the tasks of translating from LaTeX to Mathematica as well as from LaTeX to semantic LaTeX. While recurrent, recursive, and transformer networks struggle with preserving all contained information, we find that convolutional sequence-to-sequence networks achieve 95.1% and 90.7% exact matches, respectively.", + "authors": [ + "Felix Petersen", + "Moritz Schubotz", + "Andre Greiner-Petter", + "Bela Gipp" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.645", + "point2d": [ + 44.189247131347656, + -24.14484405517578 + ], + "cluster": 12.0 + }, + { + "idx": 647, + "title": "Query-Efficient Black-Box Red Teaming via Bayesian Optimization", + "abstract": "The deployment of large-scale generative models is often restricted by their potential risk of causing harm to users in unpredictable ways. We focus on the problem of black-box red teaming, where a red team generates test cases and interacts with the victim model to discover a diverse set of failures with limited query access. Existing red teaming methods construct test cases based on human supervision or language model (LM) and query all test cases in a brute-force manner without incorporating any information from past evaluations, resulting in a prohibitively large number of queries. To this end, we propose Bayesian red teaming (BRT), novel query-efficient black-box red teaming methods based on Bayesian optimization, which iteratively identify diverse positive test cases leading to model failures by utilizing the pre-defined user input pool and the past evaluations. Experimental results on various user input pools demonstrate that our method consistently finds a significantly larger number of diverse positive test cases under the limited query budget than the baseline methods. The source code is available at https://github.com/snu-mllab/Bayesian-Red-Teaming.", + "authors": [ + "Deokjae Lee", + "JunYeong Lee", + "Jung-Woo Ha", + "Jin-Hwa Kim", + "Sang-Woo Lee", + "Hwaran Lee", + "Hyun Oh Song" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.646", + "point2d": [ + -16.78272247314453, + 7.1480255126953125 + ], + "cluster": 4.0 + }, + { + "idx": 648, + "title": "SSD-LM: Semi-autoregressive Simplex-based Diffusion Language Model for Text Generation and Modular Control", + "abstract": "Despite the growing success of diffusion models in continuous-valued domains (e.g., images), similar efforts for discrete domains such as text have yet to match the performance of autoregressive language models. In this work, we present SSD-LM\u2014a diffusion-based language model with two key design choices. First, SSD-LM is semi-autoregressive, iteratively generating blocks of text, allowing for flexible output length at decoding time while enabling local bidirectional context updates. Second, it is simplex-based, performing diffusion on the natural vocabulary space rather than a learned latent space, allowing us to incorporate classifier guidance and modular control using off-the-shelf classifiers without any adaptation. We evaluate SSD-LM on unconstrained text generation benchmarks, and show that it matches or outperforms strong autoregressive GPT-2 models across standard quality and diversity metrics, while vastly outperforming diffusion-based baselines. 
On controlled text generation, SSD-LM also outperforms competitive baselines, with an extra advantage in modularity.", + "authors": [ + "Xiaochuang Han", + "Sachin Kumar", + "Yulia Tsvetkov" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.647", + "point2d": [ + -28.52684783935547, + 8.947359085083008 + ], + "cluster": 4.0 + }, + { + "idx": 649, + "title": "Recall, Expand, and Multi-Candidate Cross-Encode: Fast and Accurate Ultra-Fine Entity Typing", + "abstract": "Ultra-fine entity typing (UFET) predicts extremely free-formed types (e.g., president, politician) of a given entity mention (e.g., Joe Biden) in context. State-of-the-art (SOTA) methods use the cross-encoder (CE) based architecture. CE concatenates a mention (and its context) with each type and feeds the pair into a pretrained language model (PLM) to score their relevance. It brings deeper interaction between the mention and the type to reach better performance but has to perform N (the type set size) forward passes to infer all the types of a single mention. CE is therefore very slow in inference when the type set is large (e.g., N=10k for UFET). To this end, we propose to perform entity typing in a recall-expand-filter manner. The recall and expansion stages prune the large type set and generate K (typically much smaller than N) most relevant type candidates for each mention. At the filter stage, we use a novel model called {pasted macro \u2018NAME\u2019} to concurrently encode and score all these K candidates in only one forward pass to obtain the final type prediction. We investigate different model options for each stage and conduct extensive experiments to compare each option; experiments show that our method reaches SOTA performance on UFET and is thousands of times faster than the CE-based architecture. We also found our method is very effective in fine-grained (130 types) and coarse-grained (9 types) entity typing. Our code is available at {pasted macro \u2018CODE\u2019}.", + "authors": [ + "Chengyue Jiang", + "Wenyang Hui", + "Yong Jiang", + "Xiaobin Wang", + "Pengjun Xie", + "Kewei Tu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.648", + "point2d": [ + 40.15659713745117, + -83.00886535644531 + ], + "cluster": 14.0 + }, + { + "idx": 650, + "title": "MIR-GAN: Refining Frame-Level Modality-Invariant Representations with Adversarial Network for Audio-Visual Speech Recognition", + "abstract": "Audio-visual speech recognition (AVSR) has recently attracted a surge of research interest by leveraging multimodal signals to understand human speech. Mainstream approaches addressing this task have developed sophisticated architectures and techniques for multi-modality fusion and representation learning. However, the natural heterogeneity of different modalities causes a distribution gap between their representations, making it challenging to fuse them. In this paper, we aim to learn the shared representations across modalities to bridge their gap. Different from existing similar methods on other multimodal tasks like sentiment analysis, we focus on the temporal contextual dependencies considering the sequence-to-sequence task setting of AVSR. 
In particular, we propose an adversarial network to refine frame-level modality-invariant representations (MIR-GAN), which captures the commonality across modalities to ease the subsequent multimodal fusion process. Extensive experiments on public benchmarks LRS3 and LRS2 show that our approach outperforms the state of the art.", + "authors": [ + "Yuchen Hu", + "Chen Chen", + "Ruizhe Li", + "Heqing Zou", + "Eng Siong Chng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.649", + "point2d": [ + -62.377037048339844, + 28.87583351135254 + ], + "cluster": 16.0 + }, + { + "idx": 651, + "title": "Understanding Factual Errors in Summarization: Errors, Summarizers, Datasets, Error Detectors", + "abstract": "The propensity of abstractive summarization models to make factual errors has been studied extensively, including design of metrics to detect factual errors and annotation of errors in current systems\u2019 outputs. However, the ever-evolving nature of summarization systems, metrics, and annotated benchmarks makes factuality evaluation a moving target, and drawing clear comparisons among metrics has become increasingly difficult. In this work, we aggregate factuality error annotations from nine existing datasets and stratify them according to the underlying summarization model. We compare the performance of state-of-the-art factuality metrics, including recent ChatGPT-based metrics, on this stratified benchmark and show that their performance varies significantly across different types of summarization models. Critically, our analysis shows that much of the recent improvement in the factuality detection space has been on summaries from older (pre-Transformer) models instead of more relevant recent summarization models. We further perform a finer-grained analysis per error type and find similar performance variance across error types for different factuality metrics. Our results show that no one metric is superior in all settings or for all error types, and we provide recommendations for best practices given these insights.", + "authors": [ + "Liyan Tang", + "Tanya Goyal", + "Alex Fabbri", + "Philippe Laban", + "Jiacheng Xu", + "Semih Yavuz", + "Wojciech Kryscinski", + "Justin Rousseau", + "Greg Durrett" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.650", + "point2d": [ + -6.9957098960876465, + 45.213191986083984 + ], + "cluster": 47.0 + }, + { + "idx": 652, + "title": "GIFT: Graph-Induced Fine-Tuning for Multi-Party Conversation Understanding", + "abstract": "Addressing the issues of who says what to whom in multi-party conversations (MPCs) has recently attracted a lot of research attention. However, existing methods on MPC understanding typically embed interlocutors and utterances into sequential information flows, or utilize only the superficial aspects of the inherent graph structures in MPCs. To this end, we present a plug-and-play and lightweight method named graph-induced fine-tuning (GIFT) which can adapt various Transformer-based pre-trained language models (PLMs) for universal MPC understanding. In detail, the full and equivalent connections among utterances in a regular Transformer ignore the sparse but distinctive dependency of an utterance on another in MPCs. To distinguish different relationships between utterances, four types of edges are designed to integrate graph-induced signals into attention mechanisms to refine PLMs originally designed for processing sequential texts. 
We evaluate GIFT by implementing it into three PLMs, and test the performance on three downstream tasks including addressee recognition, speaker identification and response selection. Experimental results show that GIFT can significantly improve the performance of three PLMs on three downstream tasks and two benchmarks with only 4 additional parameters per encoding layer, achieving new state-of-the-art performance on MPC understanding.", + "authors": [ + "Jia-Chen Gu", + "Zhenhua Ling", + "Quan Liu", + "Cong Liu", + "Guoping Hu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.651", + "point2d": [ + 3.9807419776916504, + 64.23416137695312 + ], + "cluster": 49.0 + }, + { + "idx": 653, + "title": "Hybrid Uncertainty Quantification for Selective Text Classification in Ambiguous Tasks", + "abstract": "Many text classification tasks are inherently ambiguous, which results in automatic systems having a high risk of making mistakes, in spite of using advanced machine learning models. For example, toxicity detection in user-generated content is a subjective task, and notions of toxicity can be annotated according to a variety of definitions that can be in conflict with one another. Instead of relying solely on automatic solutions, moderation of the most difficult and ambiguous cases can be delegated to human workers. Potential mistakes in automated classification can be identified by using uncertainty estimation (UE) techniques. Although UE is a rapidly growing field within natural language processing, we find that state-of-the-art UE methods estimate only epistemic uncertainty and show poor performance, or under-perform trivial methods for ambiguous tasks such as toxicity detection. We argue that in order to create robust uncertainty estimation methods for ambiguous tasks it is necessary to account also for aleatoric uncertainty. In this paper, we propose a new uncertainty estimation method that combines epistemic and aleatoric UE methods. We show that by using our hybrid method, we can outperform state-of-the-art UE methods for toxicity detection and other ambiguous text classification tasks.", + "authors": [ + "Artem Vazhentsev", + "Gleb Kuzmin", + "Akim Tsvigun", + "Alexander Panchenko", + "Maxim Panov", + "Mikhail Burtsev", + "Artem Shelmanov" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.652", + "point2d": [ + 27.837209701538086, + 1.3723217248916626 + ], + "cluster": 17.0 + }, + { + "idx": 654, + "title": "BLOOM+1: Adding Language Support to BLOOM for Zero-Shot Prompting", + "abstract": "The BLOOM model is a large publicly available multilingual language model, but its pretraining was limited to 46 languages. To extend the benefits of BLOOM to other languages without incurring prohibitively large costs, it is desirable to adapt BLOOM to new languages not seen during pretraining. In this work, we apply existing language adaptation strategies to BLOOM and benchmark its zero-shot prompting performance on eight new languages in a resource-constrained setting. We find language adaptation to be effective at improving zero-shot performance in new languages. Surprisingly, we find that adapter-based finetuning is more effective than continued pretraining for large models. In addition, we discover that prompting performance is not significantly affected by language specifics, such as the writing system. It is primarily determined by the size of the language adaptation data. 
We also add new languages to BLOOMZ, which is a multitask finetuned version of BLOOM capable of following task instructions zero-shot. We find including a new language in the multitask fine-tuning mixture to be the most effective method to teach BLOOMZ a new language. We conclude that, with sufficient training data, language adaptation can generalize well to diverse languages. Our code is available at https://github.com/bigscience-workshop/multilingual-modeling.", + "authors": [ + "Zheng Xin Yong", + "Hailey Schoelkopf", + "Niklas Muennighoff", + "Alham Fikri Aji", + "David Ifeoluwa Adelani", + "Khalid Almubarak", + "M Saiful Bari", + "Lintang Sutawika", + "Jungo Kasai", + "Ahmed Baruwa", + "Genta Winata", + "Stella Biderman", + "Edward Raff", + "Dragomir Radev", + "Vassilina Nikoulina" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.653", + "point2d": [ + -19.32444953918457, + -11.281254768371582 + ], + "cluster": 3.0 + }, + { + "idx": 655, + "title": "Logic-driven Indirect Supervision: An Application to Crisis Counseling", + "abstract": "Ensuring the effectiveness of text-based crisis counseling requires observing ongoing conversations and providing feedback, both labor-intensive tasks. Automatic analysis of conversations\u2014at the full chat and utterance levels\u2014may help support counselors and provide better care. While some session-level training data (e.g., rating of patient risk) is often available from counselors, labeling utterances requires expensive post hoc annotation. But the latter can not only provide insights about conversation dynamics, but can also serve to support quality assurance efforts for counselors. In this paper, we examine if inexpensive\u2014and potentially noisy\u2014session-level annotation can help improve utterance labeling. To this end, we propose a logic-based indirect supervision approach that exploits declaratively stated structural dependencies between both levels of annotation to improve utterance modeling. We show that adding these rules gives an improvement of 3.5% f-score over a strong multi-task baseline for utterance-level predictions. We demonstrate via ablation studies how indirect supervision via logic rules also improves the consistency and robustness of the system.", + "authors": [ + "Mattia Medina Grespan", + "Meghan Broadbent", + "Xinyao Zhang", + "Katherine Axford", + "Brent Kious", + "Zac Imel", + "Vivek Srikumar" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.654", + "point2d": [ + 32.01272964477539, + 64.75440216064453 + ], + "cluster": 33.0 + }, + { + "idx": 656, + "title": "Grounding Characters and Places in Narrative Text", + "abstract": "Tracking characters and locations throughout a story can help improve the understanding of its plot structure. Prior research has analyzed characters and locations from text independently without grounding characters to their locations in narrative time. Here, we address this gap by proposing a new spatial relationship categorization task. The objective of the task is to assign a spatial relationship category for every character and location co-mention within a window of text, taking into consideration linguistic context, narrative tense, and temporal scope. To this end, we annotate spatial relationships in approximately 2500 book excerpts and train a model using contextual embeddings as features to predict these relationships. 
When applied to a set of books, this model allows us to test several hypotheses on mobility and domestic space, revealing that protagonists are more mobile than non-central characters and that women as characters tend to occupy more interior space than men. Overall, our work is the first step towards joint modeling and analysis of characters and places in narrative text.", + "authors": [ + "Sandeep Soni", + "Amanpreet Sihra", + "Elizabeth Evans", + "Matthew Wilkens", + "David Bamman" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.655", + "point2d": [ + 36.862823486328125, + 32.327083587646484 + ], + "cluster": 19.0 + }, + { + "idx": 657, + "title": "From Pretraining Data to Language Models to Downstream Tasks: Tracking the Trails of Political Biases Leading to Unfair NLP Models", + "abstract": "Language models (LMs) are pretrained on diverse data sources\u2014news, discussion forums, books, online encyclopedias. A significant portion of this data includes facts and opinions which, on one hand, celebrate democracy and diversity of ideas, and on the other hand are inherently socially biased. Our work develops new methods to (1) measure media biases in LMs trained on such corpora, along social and economic axes, and (2) measure the fairness of downstream NLP models trained on top of politically biased LMs. We focus on hate speech and misinformation detection, aiming to empirically quantify the effects of political (social, economic) biases in pretraining data on the fairness of high-stakes social-oriented tasks. Our findings reveal that pretrained LMs do have political leanings which reinforce the polarization present in pretraining corpora, propagating social biases into hate speech predictions and media biases into misinformation detectors. We discuss the implications of our findings for NLP research and propose future directions to mitigate unfairness.", + "authors": [ + "Shangbin Feng", + "Chan Young Park", + "Yuhan Liu", + "Yulia Tsvetkov" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.656", + "point2d": [ + 25.108728408813477, + 28.409271240234375 + ], + "cluster": 10.0 + }, + { + "idx": 658, + "title": "SLABERT Talk Pretty One Day: Modeling Second Language Acquisition with BERT", + "abstract": "Second language acquisition (SLA) research has extensively studied cross-linguistic transfer, the influence of linguistic structure of a speaker\u2019s native language [L1] on the successful acquisition of a foreign language [L2]. Effects of such transfer can be positive (facilitating acquisition) or negative (impeding acquisition). We find that NLP literature has not given enough attention to the phenomenon of negative transfer. To understand patterns of both positive and negative transfer between L1 and L2, we model sequential second language acquisition in LMs. Further, we build a Multilingual Age Ordered CHILDES (MAO-CHILDES)\u2014a dataset consisting of 5 typologically diverse languages, i.e., German, French, Polish, Indonesian, and Japanese\u2014to understand the degree to which native Child-Directed Speech (CDS) [L1] can help or conflict with English language acquisition [L2]. To examine the impact of native CDS, we use the TILT-based cross-lingual transfer learning approach established by Papadimitriou and Jurafsky (2020) and find that, as in human SLA, language family distance predicts more negative transfer.
Additionally, we find that conversational speech data shows greater facilitation for language acquisition than scripted speech data. Our findings call for further research using our novel Transformer-based SLA models and we would like to encourage it by releasing our code, data, and models.", + "authors": [ + "Aditya Yadavalli", + "Alekhya Yadavalli", + "Vera Tobin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.657", + "point2d": [ + -56.53339385986328, + -17.07928466796875 + ], + "cluster": 46.0 + }, + { + "idx": 659, + "title": "Contrastive Novelty-Augmented Learning: Anticipating Outliers with Large Language Models", + "abstract": "In many task settings, text classification models are likely to encounter examples from novel classes on which they cannot predict correctly. Selective prediction, in which models abstain on low-confidence examples, provides a possible solution, but existing models are often overly confident on unseen classes. To remedy this overconfidence, we introduce Contrastive Novelty-Augmented Learning (CoNAL), a two-step method that generates OOD examples representative of novel classes, then trains to decrease confidence on them. First, we generate OOD examples by prompting a large language model twice: we prompt it to enumerate relevant novel classes, then generate examples from each novel class matching the task format. Second, we train a classifier with a novel contrastive objective that encourages lower confidence on generated OOD examples than training examples. When trained with CoNAL, classifiers improve in their ability to detect and abstain on novel class examples over prior methods by an average of 2.3% in terms of accuracy under the accuracy-coverage curve (AUAC) and 5.5% AUROC across 4 NLP datasets, with no cost to in-distribution accuracy.", + "authors": [ + "Albert Xu", + "Xiang Ren", + "Robin Jia" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.658", + "point2d": [ + -3.4169669151306152, + -10.201058387756348 + ], + "cluster": 17.0 + }, + { + "idx": 660, + "title": "Learning to Initialize: Can Meta Learning Improve Cross-task Generalization in Prompt Tuning?", + "abstract": "Prompt tuning (PT) which only tunes the embeddings of an additional sequence of tokens per task, keeping the pre-trained language model (PLM) frozen, has shown remarkable performance in few-shot learning. Despite this, PT has been shown to rely heavily on good initialization of the prompt embeddings. In this work, we study meta prompt tuning (MPT) to systematically explore how meta-learning can help improve (if it can) cross-task generalization in PT through learning to initialize the prompt embeddings from other relevant tasks. We empirically analyze a representative set of meta learning algorithms in a wide range of adaptation settings with different source/target task configurations on a large set of few-shot tasks. With extensive experiments and analysis, we demonstrate the effectiveness of MPT. We find the improvement to be significant particularly on classification tasks. For other kinds of tasks such as question answering, we observe that while MPT can outperform PT in most cases, it does not always outperform multi-task learning. 
We further provide an in-depth analysis from the perspective of task similarity.", + "authors": [ + "Chengwei Qin", + "Shafiq Joty", + "Qian Li", + "Ruochen Zhao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.659", + "point2d": [ + -19.241979598999023, + -9.366400718688965 + ], + "cluster": 3.0 + }, + { + "idx": 661, + "title": "Rethinking the Role of Scale for In-Context Learning: An Interpretability-based Case Study at 66 Billion Scale", + "abstract": "Language models have been shown to perform better with an increase in scale on a wide variety of tasks via the in-context learning paradigm. In this paper, we investigate the hypothesis that the ability of a large language model to in-context learn-perform a task is not uniformly spread across all of its underlying components. Using a 66 billion parameter language model (OPT-66B) across a diverse set of 14 downstream tasks, we find this is indeed the case: ~70% of the attention heads and ~20% of the feed forward networks can be removed with minimal decline in task performance. We find substantial overlap in the set of attention heads (un)important for in-context learning across tasks and number of in-context examples. We also address our hypothesis through a task-agnostic lens, finding that a small set of attention heads in OPT-66B score highly on their ability to perform primitive induction operations associated with in-context learning, namely, prefix matching and copying. These induction heads overlap with task-specific important heads, reinforcing arguments by Olsson et al. (2022) regarding induction head generality to more sophisticated behaviors associated with in-context learning. Overall, our study provides several insights that indicate large language models may be under-trained for in-context learning and opens up questions on how to pre-train language models to more effectively perform in-context learning.", + "authors": [ + "Hritik Bansal", + "Karthik Gopalakrishnan", + "Saket Dingliwal", + "Sravan Bodapati", + "Katrin Kirchhoff", + "Dan Roth" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.660", + "point2d": [ + -15.68601131439209, + -22.80141830444336 + ], + "cluster": 3.0 + }, + { + "idx": 662, + "title": "Question-Answering in a Low-resourced Language: Benchmark Dataset and Models for Tigrinya", + "abstract": "Question-Answering (QA) has seen significant advances recently, achieving near human-level performance over some benchmarks. However, these advances focus on high-resourced languages such as English, while the task remains unexplored for most other languages, mainly due to the lack of annotated datasets. This work presents a native QA dataset for an East African language, Tigrinya. The dataset contains 10.6K question-answer pairs spanning 572 paragraphs extracted from 290 news articles on various topics. The dataset construction method is discussed, which is applicable to constructing similar resources for related languages. We present comprehensive experiments and analyses of several resource-efficient approaches to QA, including monolingual, cross-lingual, and multilingual setups, along with comparisons against machine-translated silver data. Our strong baseline models reach 76% in the F1 score, while the estimated human performance is 92%, indicating that the benchmark presents a good challenge for future work. 
We make the dataset, models, and leaderboard publicly available.", + "authors": [ + "Fitsum Gaim", + "Wonsuk Yang", + "Hancheol Park", + "Jong Park" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.661", + "point2d": [ + 66.5560302734375, + 16.06321144104004 + ], + "cluster": 5.0 + }, + { + "idx": 663, + "title": "ESCOXLM-R: Multilingual Taxonomy-driven Pre-training for the Job Market Domain", + "abstract": "The increasing number of benchmarks for Natural Language Processing (NLP) tasks in the computational job market domain highlights the demand for methods that can handle job-related tasks such as skill extraction, skill classification, job title classification, and de-identification. While some approaches have been developed that are specific to the job market domain, there is a lack of generalized, multilingual models and benchmarks for these tasks. In this study, we introduce a language model called ESCOXLM-R, based on XLM-R-large, which uses domain-adaptive pre-training on the European Skills, Competences, Qualifications and Occupations (ESCO) taxonomy, covering 27 languages. The pre-training objectives for ESCOXLM-R include dynamic masked language modeling and a novel additional objective for inducing multilingual taxonomical ESCO relations. We comprehensively evaluate the performance of ESCOXLM-R on 6 sequence labeling and 3 classification tasks in 4 languages and find that it achieves state-of-the-art results on 6 out of 9 datasets. Our analysis reveals that ESCOXLM-R performs better on short spans and outperforms XLM-R-large on entity-level and surface-level span-F1, likely due to ESCO containing short skill and occupation titles, and encoding information on the entity-level.", + "authors": [ + "Mike Zhang", + "Rob van der Goot", + "Barbara Plank" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.662", + "point2d": [ + -16.007299423217773, + -29.3114013671875 + ], + "cluster": 20.0 + }, + { + "idx": 664, + "title": "CITADEL: Conditional Token Interaction via Dynamic Lexical Routing for Efficient and Effective Multi-Vector Retrieval", + "abstract": "Multi-vector retrieval methods combine the merits of sparse (e.g. BM25) and dense (e.g. DPR) retrievers and have achieved state-of-the-art performance on various retrieval tasks. These methods, however, are orders of magnitude slower and need much more space to store their indices compared to their single-vector counterparts. In this paper, we unify different multi-vector retrieval models from a token routing viewpoint and propose conditional token interaction via dynamic lexical routing, namely CITADEL, for efficient and effective multi-vector retrieval. CITADEL learns to route different token vectors to the predicted lexical keys such that a query token vector only interacts with document token vectors routed to the same key. This design significantly reduces the computation cost while maintaining high accuracy. Notably, CITADEL achieves the same or slightly better performance than the previous state of the art, ColBERT-v2, on both in-domain (MS MARCO) and out-of-domain (BEIR) evaluations, while being nearly 40 times faster.
Source code and data are available at https://github.com/facebookresearch/dpr-scale/tree/citadel.", + "authors": [ + "Minghan Li", + "Sheng-Chieh Lin", + "Barlas Oguz", + "Asish Ghoshal", + "Jimmy Lin", + "Yashar Mehdad", + "Wen-tau Yih", + "Xilun Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.663", + "point2d": [ + 13.030152320861816, + -15.340655326843262 + ], + "cluster": 18.0 + }, + { + "idx": 665, + "title": "MultiCapCLIP: Auto-Encoding Prompts for Zero-Shot Multilingual Visual Captioning", + "abstract": "Supervised visual captioning models typically require a large number of images or videos paired with descriptions in a specific language (i.e., the vision-caption pairs) for training. However, collecting and labeling large-scale datasets is time-consuming and expensive for many scenarios and languages. Therefore, sufficient labeled pairs are usually not available. To deal with the label shortage problem, we present a simple yet effective zero-shot approach, MultiCapCLIP, that can generate visual captions for different scenarios and languages without any labeled vision-caption pairs of downstream datasets. In the training stage, MultiCapCLIP only requires text data for input. Then it conducts two main steps: 1) retrieving concept prompts that preserve the corresponding domain knowledge of new scenarios; 2) auto-encoding the prompts to learn writing styles to output captions in a desired language. In the testing stage, MultiCapCLIP instead takes visual data as input directly to retrieve the concept prompts to generate the final visual descriptions. The extensive experiments on image and video captioning across four benchmarks and four languages (i.e., English, Chinese, German, and French) confirm the effectiveness of our approach. Compared with state-of-the-art zero-shot and weakly-supervised methods, our method achieves 4.8% and 21.5% absolute improvements in terms of BLEU@4 and CIDEr metrics. Our code is available at https://github.com/yangbang18/MultiCapCLIP.", + "authors": [ + "Bang Yang", + "Fenglin Liu", + "Xian Wu", + "Yaowei Wang", + "Xu Sun", + "Yuexian Zou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.664", + "point2d": [ + -57.57613754272461, + 45.98017120361328 + ], + "cluster": 43.0 + }, + { + "idx": 666, + "title": "Transfer and Active Learning for Dissonance Detection: Addressing the Rare-Class Challenge", + "abstract": "While transformer-based systems have enabled greater accuracies with fewer training examples, data acquisition obstacles still persist for rare-class tasks \u2013 when the class label is very infrequent (e.g. < 5% of samples). Active learning has in general been proposed to alleviate such challenges, but choice of selection strategy, the criteria by which rare-class examples are chosen, has not been systematically evaluated. Further, transformers enable iterative transfer-learning approaches. We propose and investigate transfer- and active learning solutions to the rare class problem of dissonance detection through utilizing models trained on closely related tasks and the evaluation of acquisition strategies, including a proposed probability-of-rare-class (PRC) approach. We perform these experiments for a specific rare-class problem: collecting language samples of cognitive dissonance from social media.
We find that PRC is a simple and effective strategy to guide annotations and ultimately improve model accuracy, while transfer-learning in a specific order can improve the cold-start performance of the learner but does not benefit iterations of active learning.", + "authors": [ + "Vasudha Varadarajan", + "Swanie Juhng", + "Syeda Mahwish", + "Xiaoran Liu", + "Jonah Luby", + "Christian Luhmann", + "H. Andrew Schwartz" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.665", + "point2d": [ + -3.9240548610687256, + -8.721296310424805 + ], + "cluster": 17.0 + }, + { + "idx": 667, + "title": "In-sample Curriculum Learning by Sequence Completion for Natural Language Generation", + "abstract": "Curriculum learning has shown promising improvements in multiple domains by training machine learning models from easy samples to hard ones. Previous works, which either design rules or train models for scoring the difficulty, rely heavily on task-specific expertise and cannot generalize. Inspired by the \u201ceasy-to-hard\u201d intuition, we propose to do in-sample curriculum learning for natural language generation tasks. Our learning strategy starts training the model to generate the last few words, i.e., do sequence completion, and gradually extends to generate the whole output sequence. Comprehensive experiments show that it generalizes well to different tasks and achieves significant improvements over strong baselines.", + "authors": [ + "Qi Jia", + "Yizhu Liu", + "Haifeng Tang", + "Kenny Zhu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.666", + "point2d": [ + -12.170451164245605, + -42.12092971801758 + ], + "cluster": 4.0 + }, + { + "idx": 668, + "title": "Product Question Answering in E-Commerce: A Survey", + "abstract": "Product question answering (PQA), aiming to automatically provide instant responses to customers\u2019 questions in E-Commerce platforms, has drawn increasing attention in recent years. Compared with typical QA problems, PQA exhibits unique challenges such as the subjectivity and reliability of user-generated contents in E-commerce platforms. Therefore, various problem settings and novel methods have been proposed to capture these special characteristics. In this paper, we aim to systematically review existing research efforts on PQA. Specifically, we categorize PQA studies into four problem settings in terms of the form of provided answers. We analyze the pros and cons, as well as present existing datasets and evaluation protocols for each setting. We further summarize the most significant challenges that distinguish PQA from general QA applications and discuss their corresponding solutions. Finally, we conclude this paper by discussing several promising future directions.", + "authors": [ + "Yang Deng", + "Wenxuan Zhang", + "Qian Yu", + "Wai Lam" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.667", + "point2d": [ + 66.12220001220703, + 21.081417083740234 + ], + "cluster": 5.0 + }, + { + "idx": 669, + "title": "Towards Domain-Agnostic and Domain-Adaptive Dementia Detection from Spoken Language", + "abstract": "Health-related speech datasets are often small and varied in focus. This makes it difficult to leverage them to effectively support healthcare goals. Robust transfer of linguistic features across different datasets orbiting the same goal carries potential to address this concern.
To test this hypothesis, we experiment with domain adaptation (DA) techniques on heterogeneous spoken language data to evaluate generalizability across diverse datasets for a common task: dementia detection. We find that adapted models exhibit better performance across conversational and task-oriented datasets. The feature-augmented DA method achieves a 22% increase in accuracy when adapting from a conversational to a task-specific dataset, compared to a jointly trained baseline. This suggests a promising capacity of these techniques to allow for productive use of disparate data for a complex spoken language healthcare task.", + "authors": [ + "Shahla Farzana", + "Natalie Parde" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.668", + "point2d": [ + 28.64527702331543, + 70.58710479736328 + ], + "cluster": 16.0 + }, + { + "idx": 670, + "title": "Generalizing Backpropagation for Gradient-Based Interpretability", + "abstract": "Many popular feature-attribution methods for interpreting deep neural networks rely on computing the gradients of a model\u2019s output with respect to its inputs. While these methods can indicate which input features may be important for the model\u2019s prediction, they reveal little about the inner workings of the model itself. In this paper, we observe that the gradient computation of a model is a special case of a more general formulation using semirings. This observation allows us to generalize the backpropagation algorithm to efficiently compute other interpretable statistics about the gradient graph of a neural network, such as the highest-weighted path and entropy. We implement this generalized algorithm, evaluate it on synthetic datasets to better understand the statistics it computes, and apply it to study BERT\u2019s behavior on the subject\u2013verb number agreement task (SVA). With this method, we (a) validate that the amount of gradient flow through a component of a model reflects its importance to a prediction and (b) for SVA, identify which pathways of the self-attention mechanism are most important.", + "authors": [ + "Kevin Du", + "Lucas Torroba Hennigen", + "Niklas Stoehr", + "Alex Warstadt", + "Ryan Cotterell" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.669", + "point2d": [ + -41.73143005371094, + -32.8941535949707 + ], + "cluster": 6.0 + }, + { + "idx": 671, + "title": "UPPAM: A Unified Pre-training Architecture for Political Actor Modeling based on Language", + "abstract": "Modeling political actors is at the core of quantitative political science. Existing works have incorporated contextual information to better learn the representation of political actors for specific tasks through graph models. However, they are limited to the structure and objective of training settings and cannot be generalized to all politicians and other tasks. In this paper, we propose a Unified Pre-training Architecture for Political Actor Modeling based on language (UPPAM). In UPPAM, we aggregate statements to represent political actors and learn the mapping from languages to representation, instead of learning the representation of particular persons. We further design structure-aware contrastive learning and behavior-driven contrastive learning tasks, to inject multidimensional information in the political context into the mapping. In this framework, we can profile political actors from different aspects and solve various downstream tasks.
Experimental results demonstrate the effectiveness and generalization capability of our method.", + "authors": [ + "Xinyi Mou", + "Zhongyu Wei", + "Qi Zhang", + "Xuanjing Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.670", + "point2d": [ + 28.450937271118164, + 28.159168243408203 + ], + "cluster": 19.0 + }, + { + "idx": 672, + "title": "Generic Temporal Reasoning with Differential Analysis and Explanation", + "abstract": "Temporal reasoning is the task of predicting temporal relations of event pairs. While temporal reasoning models can perform reasonably well on in-domain benchmarks, we have little idea of these systems\u2019 generalizability due to existing datasets\u2019 limitations. In this work, we introduce a novel task named TODAY that bridges this gap with temporal differential analysis, which, as the name suggests, evaluates whether systems can correctly understand the effect of incremental changes. Specifically, TODAY introduces slight contextual changes for given event pairs, and systems are asked to tell how this subtle contextual change would affect relevant temporal relation distributions. To facilitate learning, TODAY also annotates human explanations. We show that existing models, including GPT-3.5, drop to random guessing on TODAY, suggesting that they heavily rely on spurious information rather than proper reasoning for temporal predictions. On the other hand, we show that TODAY\u2019s supervision style and explanation annotations can be used in joint learning, encouraging models to use more appropriate signals during training and thus outperform across several benchmarks. TODAY can also be used to train models to solicit incidental supervision from noisy sources such as GPT-3.5, thus moving us more toward the goal of generic temporal reasoning systems.", + "authors": [ + "Yu Feng", + "Ben Zhou", + "Haoyu Wang", + "Helen Jin", + "Dan Roth" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.671", + "point2d": [ + 52.61808395385742, + -49.557106018066406 + ], + "cluster": 22.0 + }, + { + "idx": 673, + "title": "Model-Based Simulation for Optimising Smart Reply", + "abstract": "Smart Reply (SR) systems present a user with a set of replies, of which one can be selected in place of having to type out a response. To perform well at this task, a system should be able to effectively present the user with a diverse set of options, to maximise the chance that at least one of them conveys the user\u2019s desired response. This is a significant challenge, due to the lack of datasets containing sets of responses to learn from. As a result, previous work has focused largely on post-hoc diversification, rather than explicitly learning to predict sets of responses. Motivated by this problem, we present a novel method, SimSR, that employs model-based simulation to discover high-value response sets, through simulating possible user responses with a learned world model. Unlike previous approaches, this allows our method to directly optimise the end-goal of SR\u2013maximising the relevance of at least one of the predicted replies.
Empirically on two public datasets, when compared to SoTA baselines, our method achieves up to 21% and 18% improvement in ROUGE score and Self-ROUGE score respectively.", + "authors": [ + "Benjamin Towle", + "Ke Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.672", + "point2d": [ + 18.905033111572266, + 61.45537567138672 + ], + "cluster": 2.0 + }, + { + "idx": 674, + "title": "Beyond Contrastive Learning: A Variational Generative Model for Multilingual Retrieval", + "abstract": "Contrastive learning has been successfully used for retrieval of semantically aligned sentences, but it often requires large batch sizes or careful engineering to work well. In this paper, we instead propose a generative model for learning multilingual text embeddings which can be used to retrieve or score sentence pairs. Our model operates on parallel data in N languages and, through an approximation we introduce, efficiently encourages source separation in this multilingual setting, separating semantic information that is shared between translations from stylistic or language-specific variation. We show careful large-scale comparisons between contrastive and generation-based approaches for learning multilingual text embeddings, a comparison that has not been done to the best of our knowledge despite the popularity of these approaches. We evaluate this method on a suite of tasks including semantic similarity, bitext mining, and cross-lingual question retrieval - the last of which we introduce in this paper. Overall, our model outperforms both a strong contrastive and generative baseline on these tasks.", + "authors": [ + "John Wieting", + "Jonathan Clark", + "William Cohen", + "Graham Neubig", + "Taylor Berg-Kirkpatrick" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.673", + "point2d": [ + -13.239373207092285, + -31.058473587036133 + ], + "cluster": 20.0 + }, + { + "idx": 675, + "title": "On the Blind Spots of Model-Based Evaluation Metrics for Text Generation", + "abstract": "In this work, we explore a useful but often neglected methodology for robustness analysis of text generation evaluation metrics: stress tests with synthetic data. Basically, we design and synthesize a wide range of potential errors and check whether they result in a commensurate drop in the metric scores. We examine a range of recently proposed evaluation metrics based on pretrained language models, for the tasks of open-ended generation, translation, and summarization. Our experiments reveal interesting insensitivities, biases, or even loopholes in existing metrics. For example, we find that BERTScore is confused by truncation errors in summarization, and MAUVE (built on top of GPT-2) is insensitive to errors at the beginning or middle of generations. Further, we investigate the reasons behind these blind spots and suggest practical workarounds for a more reliable evaluation of text generation. 
We have released our code and data at https://github.com/cloudygoose/blindspot_nlg.", + "authors": [ + "Tianxing He", + "Jingyu Zhang", + "Tianle Wang", + "Sachin Kumar", + "Kyunghyun Cho", + "James Glass", + "Yulia Tsvetkov" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.674", + "point2d": [ + -19.423118591308594, + 15.96526050567627 + ], + "cluster": 4.0 + }, + { + "idx": 676, + "title": "Dealing with Semantic Underspecification in Multimodal NLP", + "abstract": "Intelligent systems that aim at mastering language as humans do must deal with its semantic underspecification, namely, the possibility for a linguistic signal to convey only part of the information needed for communication to succeed. Consider the usages of the pronoun they, which can leave the gender and number of its referent(s) underspecified. Semantic underspecification is not a bug but a crucial language feature that boosts its storage and processing efficiency. Indeed, human speakers can quickly and effortlessly integrate semantically-underspecified linguistic signals with a wide range of non-linguistic information, e.g., the multimodal context, social or cultural conventions, and shared knowledge. Standard NLP models have, in principle, no or limited access to such extra information, while multimodal systems grounding language into other modalities, such as vision, are naturally equipped to account for this phenomenon. However, we show that they struggle with it, which could negatively affect their performance and lead to harmful consequences when used for applications. In this position paper, we argue that our community should be aware of semantic underspecification if it aims to develop language technology that can successfully interact with human users. We discuss some applications where mastering it is crucial and outline a few directions toward achieving this goal.", + "authors": [ + "Sandro Pezzelle" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.675", + "point2d": [ + 10.02132511138916, + 31.229488372802734 + ], + "cluster": 46.0 + }, + { + "idx": 677, + "title": "Trigger Warning Assignment as a Multi-Label Document Classification Problem", + "abstract": "A trigger warning is used to warn people about potentially disturbing content. We introduce trigger warning assignment as a multi-label classification task, create the Webis Trigger Warning Corpus 2022, and with it the first dataset of 1 million fanfiction works from Archive of our Own with up to 36 different warnings per document. To provide a reliable catalog of trigger warnings, we organized 41 million free-form tags assigned by fanfiction authors into the first comprehensive taxonomy of trigger warnings by mapping them to the 36 institutionally recommended warnings. To determine the best operationalization of trigger warnings, we explore state-of-the-art multi-label models, examining the trade-off between assigning coarse- and fine-grained warnings, open- and closed-set classification, document length, and label confidence. Our models achieve micro-F1 scores of about 0.5, which reveals the difficulty of the task.
Tailored representations, long input sequences, and a higher recall on rare warnings would help.", + "authors": [ + "Matti Wiegmann", + "Magdalena Wolska", + "Christopher Schr\u00f6der", + "Ole Borchardt", + "Benno Stein", + "Martin Potthast" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.676", + "point2d": [ + 34.32080841064453, + 10.79106616973877 + ], + "cluster": 19.0 + }, + { + "idx": 678, + "title": "WhitenedCSE: Whitening-based Contrastive Learning of Sentence Embeddings", + "abstract": "This paper presents a whitening-based contrastive learning method for sentence embedding learning (WhitenedCSE), which combines contrastive learning with a novel shuffled group whitening. Generally, contrastive learning pulls distortions of a single sample (i.e., positive samples) close and pushes negative samples far away, correspondingly facilitating the alignment and uniformity in the feature space. A popular alternative to the \u201cpushing\u201d operation is whitening the feature space, which scatters all the samples for uniformity. Since the whitening and the contrastive learning have large redundancy w.r.t. the uniformity, they are usually used separately and do not easily work together. For the first time, this paper integrates whitening into the contrastive learning scheme and facilitates two benefits. 1) Better uniformity. We find that these two approaches are not totally redundant but actually have some complementarity due to different uniformity mechanisms. 2) Better alignment. We randomly divide the feature into multiple groups along the channel axis and perform whitening independently within each group. By shuffling the group division, we derive multiple distortions of a single sample and thus increase the positive sample diversity. Consequently, using multiple positive samples with enhanced diversity further improves contrastive learning due to better alignment. Extensive experiments on seven semantic textual similarity tasks show our method achieves consistent improvement over the contrastive learning baseline and sets a new state of the art, e.g., 78.78% (+2.53% based on BERT-base) Spearman correlation on STS tasks.", + "authors": [ + "Wenjie Zhuo", + "Yifan Sun", + "Xiaohan Wang", + "Linchao Zhu", + "Yi Yang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.677", + "point2d": [ + 4.005205154418945, + -35.16714096069336 + ], + "cluster": 20.0 + }, + { + "idx": 679, + "title": "Federated Learning for Semantic Parsing: Task Formulation, Evaluation Setup, New Algorithms", + "abstract": "This paper studies a new task of federated learning (FL) for semantic parsing, where multiple clients collaboratively train one global model without sharing their semantic parsing data. By leveraging data from multiple clients, the FL paradigm can be especially beneficial for clients that have little training data to develop a data-hungry neural semantic parser on their own. We propose an evaluation setup to study this task, where we re-purpose widely-used single-domain text-to-SQL datasets as clients to form a realistic heterogeneous FL setting and collaboratively train a global model.
As standard FL algorithms suffer from the high client heterogeneity in our realistic setup, we further propose a novel LOss Reduction Adjusted Re-weighting (Lorar) mechanism, which adjusts each client\u2019s contribution to the global model update based on its training loss reduction during each round. Our intuition is that the larger the loss reduction, the further away the current global model is from the client\u2019s local optimum, and the larger the weight the client should get. By applying Lorar to three widely adopted FL algorithms (FedAvg, FedOPT and FedProx), we observe that their performance can be improved substantially on average (4%-20% absolute gain under MacroAvg) and that clients with smaller datasets enjoy larger performance gains. In addition, the global model converges faster for almost all the clients.", + "authors": [ + "Tianshu Zhang", + "Changchang Liu", + "Wei-Han Lee", + "Yu Su", + "Huan Sun" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.678", + "point2d": [ + -30.768831253051758, + -59.39351272583008 + ], + "cluster": 44.0 + }, + { + "idx": 680, + "title": "Causality-Guided Multi-Memory Interaction Network for Multivariate Stock Price Movement Prediction", + "abstract": "Over the past few years, we\u2019ve witnessed an enormous interest in stock price movement prediction using AI techniques. In recent literature, auxiliary data has been used to improve prediction accuracy, such as textual news. When predicting a particular stock, we assume that information from other stocks should also be utilized as auxiliary data to enhance performance. In this paper, we propose the Causality-guided Multi-memory Interaction Network (CMIN), a novel end-to-end deep neural network for stock movement prediction which, for the first time, models the multi-modality between financial text data and causality-enhanced stock correlations to achieve higher prediction accuracy. CMIN transforms the basic attention mechanism into Causal Attention by calculating transfer entropy between multivariate stocks in order to avoid attention on spurious correlations. Furthermore, we introduce a fusion mechanism to model the multi-directional interactions through which CMIN learns not only the self-influence but also the interactive influence in information flows representing the interrelationship between text and stock correlations. The effectiveness of the proposed approach is demonstrated by experiments on three real-world datasets collected from the U.S. and Chinese markets, where CMIN outperforms existing models to establish a new state-of-the-art prediction accuracy.", + "authors": [ + "Di Luo", + "Weiheng Liao", + "Shuqi Li", + "Xin Cheng", + "Rui Yan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.679", + "point2d": [ + 35.37466049194336, + 1.9693539142608643 + ], + "cluster": 27.0 + }, + { + "idx": 681, + "title": "DSRM: Boost Textual Adversarial Training with Distribution Shift Risk Minimization", + "abstract": "Adversarial training is one of the best-performing methods in improving the robustness of deep language models. However, robust models come at the cost of high time consumption, as they require multi-step gradient ascents or word substitutions to obtain adversarial samples.
In addition, these generated samples are deficient in grammatical quality and semantic consistency, which impairs the effectiveness of adversarial training. To address these problems, we introduce a novel, effective procedure that instead performs adversarial training with only clean data. Our procedure, distribution shift risk minimization (DSRM), estimates the adversarial loss by perturbing the input data\u2019s probability distribution rather than its embeddings. This formulation results in a robust model that minimizes the expected global loss under adversarial attacks. Our approach requires zero adversarial samples for training and reduces time consumption by up to 70% compared to current best-performing adversarial training methods. Experiments demonstrate that DSRM considerably improves BERT\u2019s resistance to textual adversarial attacks and achieves state-of-the-art robust accuracy on various benchmarks.", + "authors": [ + "SongYang Gao", + "Shihan Dou", + "Yan Liu", + "Xiao Wang", + "Qi Zhang", + "Zhongyu Wei", + "Jin Ma", + "Ying Shan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.680", + "point2d": [ + 5.1100006103515625, + 6.0401530265808105 + ], + "cluster": 48.0 + }, + { + "idx": 682, + "title": "A Simple and Flexible Modeling for Mental Disorder Detection by Learning from Clinical Questionnaires", + "abstract": "Social media is one of the most highly sought resources for analyzing characteristics of the language used by its users. In particular, many researchers have utilized various linguistic features of mental health problems from social media. However, existing approaches to detecting mental disorders face critical challenges, such as the scarcity of high-quality data or the trade-off between addressing the complexity of models and presenting interpretable results grounded in expert domain knowledge. To address these challenges, we design a simple but flexible model that preserves domain-based interpretability. We propose a novel approach that captures the semantic meanings directly from the text and compares them to symptom-related descriptions. Experimental results demonstrate that our model outperforms relevant baselines on various mental disorder detection tasks. Our detailed analysis shows that the proposed model is effective at leveraging domain knowledge, transferable to other mental disorders, and providing interpretable detection results.", + "authors": [ + "Hoyun Song", + "Jisu Shin", + "Huije Lee", + "Jong Park" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.681", + "point2d": [ + 33.748291015625, + 73.50569152832031 + ], + "cluster": 42.0 + }, + { + "idx": 683, + "title": "Downstream Datasets Make Surprisingly Good Pretraining Corpora", + "abstract": "For most natural language processing tasks, the dominant practice is to finetune large pretrained transformer models (e.g., BERT) using smaller downstream datasets. Despite the success of this approach, it remains unclear to what extent these gains are attributable to the massive background corpora employed for pretraining versus to the pretraining objectives themselves.
This paper introduces a large-scale study of self-pretraining, where the same (downstream) training data is used for both pretraining and finetuning. In experiments addressing both ELECTRA and RoBERTa models and 10 distinct downstream classification datasets, we observe that self-pretraining rivals standard pretraining on the BookWiki corpus (despite using around 10x\u2013500x less data), outperforming the latter on 7 and 5 datasets, respectively. Surprisingly, these task-specific pretrained models often perform well on other tasks, including the GLUE benchmark. Besides classification tasks, self-pretraining also provides benefits on structured output prediction tasks such as span-based question answering and commonsense inference, often providing more than 50% of the performance boosts provided by pretraining on the BookWiki corpus. Our results hint that in many scenarios, performance gains attributable to pretraining are driven primarily by the pretraining objective itself and are not always attributable to the use of external pretraining data in massive amounts. These findings are especially relevant in light of concerns about intellectual property and offensive content in web-scale pretraining data.", + "authors": [ + "Kundan Krishna", + "Saurabh Garg", + "Jeffrey Bigham", + "Zachary Lipton" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.682", + "point2d": [ + -29.086275100708008, + -27.301429748535156 + ], + "cluster": 39.0 + }, + { + "idx": 684, + "title": "Towards Open-World Product Attribute Mining: A Lightly-Supervised Approach", + "abstract": "We present a new task setting for attribute mining on e-commerce products, serving as a practical solution to extract open-world attributes without extensive human intervention. Our supervision comes from a high-quality seed attribute set bootstrapped from existing resources, and we aim to expand the attribute vocabulary of existing seed types, and also to discover any new attribute types automatically. A new dataset is created to support our setting, and our approach Amacer is proposed specifically to tackle the limited supervision. In particular, given that no direct supervision is available for those unseen new attributes, our novel formulation exploits self-supervised heuristic and unsupervised latent attributes, which attains implicit semantic signals as additional supervision by leveraging product context. Experiments suggest that our approach surpasses various baselines by 12 F1, expanding attributes of existing types significantly by up to 12 times, and discovering values from 39% new types.", + "authors": [ + "Liyan Xu", + "Chenwei Zhang", + "Xian Li", + "Jingbo Shang", + "Jinho D. Choi"
Choi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.683", + "point2d": [ + 25.348875045776367, + -61.02793884277344 + ], + "cluster": 38.0 + }, + { + "idx": 685, + "title": "XDailyDialog: A Multilingual Parallel Dialogue Corpus", + "abstract": "High-quality datasets are significant to the development of dialogue models.However, most existing datasets for open-domain dialogue modeling are limited to a single language.The absence of multilingual open-domain dialog datasets not only limits the research on multilingual or cross-lingual transfer learning, but also hinders the development of robust open-domain dialog systems that can be deployed in other parts of the world.In this paper, we provide a multilingual parallel open-domain dialog dataset, XDailyDialog, to enable researchers to explore the challenging task of multilingual and cross-lingual open-domain dialog. XDailyDialog includes 13K dialogues aligned across 4 languages (52K dialogues and 410K utterances in total). We then propose a dialog generation model, kNN-Chat, which has a novel kNN-search mechanism to support unified response retrieval for monolingual, multilingual, and cross-lingual dialogue. Experiment results show the effectiveness of this framework. We will make XDailyDialog and kNN-Chat publicly available soon.", + "authors": [ + "Zeming Liu", + "Ping Nie", + "Jie Cai", + "Haifeng Wang", + "Zheng-Yu Niu", + "Peng Zhang", + "Mrinmaya Sachan", + "Kaiping Peng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.684", + "point2d": [ + 12.089465141296387, + 67.26970672607422 + ], + "cluster": 49.0 + }, + { + "idx": 686, + "title": "PAL to Lend a Helping Hand: Towards Building an Emotion Adaptive Polite and Empathetic Counseling Conversational Agent", + "abstract": "The World Health Organization (WHO) has significantly emphasized the need for mental health care. The social stigma associated with mental illness prevents individuals from addressing their issues and getting assistance. In such a scenario, the relevance of online counseling has increased dramatically. The feelings and attitudes that a client and a counselor express towards each other result in a higher or lower counseling experience. A counselor should be friendly and gain clients\u2019 trust to make them share their problems comfortably. Thus, it is essential for the counselor to adequately comprehend the client\u2019s emotions and ensure client\u2019s welfare, i.e. s/he should adapt and deal with the clients politely and empathetically to provide a pleasant, cordial and personalized experience. Motivated by this, in this work, we attempt to build a novel Polite and empAthetic counseLing conversational agent PAL to lay down the counseling support to substance addict and crime victims. To have client\u2019s emotion-based polite and empathetic responses, two counseling datasets laying down the counseling support to substance addicts and crime victims are annotated. These annotated datasets are used to build PAL in a reinforcement learning framework. A novel reward function is formulated to ensure correct politeness and empathy preferences as per client\u2019s emotions with naturalness and non-repetitiveness in responses. Thorough automatic and human evaluation showcase the usefulness and strength of the designed novel reward function. 
Our proposed system is scalable and can be easily modified with different preference-model modules as needed.", + "authors": [ + "Kshitij Mishra", + "Priyanshu Priya", + "Asif Ekbal" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.685", + "point2d": [ + 30.50237464904785, + 65.8048095703125 + ], + "cluster": 33.0 + }, + { + "idx": 687, + "title": "Bidirectional Generative Framework for Cross-domain Aspect-based Sentiment Analysis", + "abstract": "Cross-domain aspect-based sentiment analysis (ABSA) aims to perform various fine-grained sentiment analysis tasks on a target domain by transferring knowledge from a source domain. Since labeled data only exists in the source domain, a model is expected to bridge the domain gap for tackling cross-domain ABSA. Though domain adaptation methods have proven to be effective, most of them are based on a discriminative model, which needs to be specifically designed for different ABSA tasks. To offer a more general solution, we propose a unified bidirectional generative framework to tackle various cross-domain ABSA tasks. Specifically, our framework trains a generative model in both text-to-label and label-to-text directions. The former transforms each task into a unified format to learn domain-agnostic features, and the latter generates natural sentences from noisy labels for data augmentation, with which a more accurate model can be trained. To investigate the effectiveness and generality of our framework, we conduct extensive experiments on four cross-domain ABSA tasks and present new state-of-the-art results on all tasks. Our data and code are publicly available at https://github.com/DAMO-NLP-SG/BGCA.", + "authors": [ + "Yue Deng", + "Wenxuan Zhang", + "Sinno Jialin Pan", + "Lidong Bing" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.686", + "point2d": [ + 15.892141342163086, + -32.98583984375 + ], + "cluster": 13.0 + }, + { + "idx": 688, + "title": "Contrastive Decoding: Open-ended Text Generation as Optimization", + "abstract": "Given a language model (LM), maximum probability is a poor decoding objective for open-ended generation, because it produces short and repetitive text. On the other hand, sampling can often produce incoherent text that drifts from the original topics. We propose contrastive decoding (CD), a reliable decoding approach that optimizes a contrastive objective subject to a plausibility constraint. The contrastive objective returns the difference between the likelihood under a large LM (called the expert, e.g. OPT-13B) and a small LM (called the amateur, e.g. OPT-125M), and the constraint ensures that the outputs are plausible. CD is inspired by the fact that the failures of larger LMs (e.g., repetition, incoherence) are even more prevalent in smaller LMs, and that this difference signals which texts should be preferred. CD requires zero additional training, and produces higher quality text than decoding from the larger LM alone.
It also works across model scales (OPT-13B and GPT2-1.5B) and significantly outperforms four strong decoding algorithms (e.g., nucleus, top-k) in automatic and human evaluations across Wikipedia, news and story domains.", + "authors": [ + "Xiang Lisa Li", + "Ari Holtzman", + "Daniel Fried", + "Percy Liang", + "Jason Eisner", + "Tatsunori Hashimoto", + "Luke Zettlemoyer", + "Mike Lewis" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.687", + "point2d": [ + -26.59319496154785, + 14.607501983642578 + ], + "cluster": 4.0 + }, + { + "idx": 689, + "title": "Resolving Indirect Referring Expressions for Entity Selection", + "abstract": "Recent advances in language modeling have enabled new conversational systems. In particular, it is often desirable for people to make choices among specified options when using such systems. We address the problem of reference resolution, when people use natural expressions to choose between real-world entities. For example, given the choice \u2018Should we make a Simnel cake or a Pandan cake?\u2019 a natural response from a non-expert may be indirect: \u2018let\u2019s make the green one\u2019. Reference resolution has been little studied with natural expressions, thus robustly understanding such language has large potential for improving naturalness in dialog, recommendation, and search systems. We create AltEntities (Alternative Entities), a new public dataset of entity pairs and utterances, and develop models for the disambiguation problem. Consisting of 42K indirect referring expressions across three domains, it enables for the first time the study of how large language models can be adapted to this task. We find they achieve 82%-87% accuracy in realistic settings, which while reasonable also invites further advances.", + "authors": [ + "Mohammad Javad Hosseini", + "Filip Radlinski", + "Silvia Pareti", + "Annie Louis" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.688", + "point2d": [ + 48.18928527832031, + 5.125797748565674 + ], + "cluster": 33.0 + }, + { + "idx": 690, + "title": "Accelerating Transformer Inference for Translation via Parallel Decoding", + "abstract": "Autoregressive decoding limits the efficiency of transformers for Machine Translation (MT). The community proposed specific network architectures and learning-based methods to solve this issue, which are expensive and require changes to the MT model, gaining inference speed at the cost of the translation quality. In this paper, we propose to address the problem from the point of view of decoding algorithms, as a less explored but rather compelling direction. We propose to reframe the standard greedy autoregressive decoding of MT with a parallel formulation leveraging Jacobi and Gauss-Seidel fixed-point iteration methods for fast inference. This formulation allows speeding up existing models without training or modifications while retaining translation quality. We present three parallel decoding algorithms and test them on different languages and models, showing how the parallelization introduces a speedup of up to 38% w.r.t. the standard autoregressive decoding and nearly 2x when scaling the method on parallel resources.
Finally, we introduce a decoding dependency graph visualizer (DDGviz) that lets us see how the model has learned the conditional dependence between tokens and inspect the decoding procedure.", + "authors": [ + "Andrea Santilli", + "Silvio Severino", + "Emilian Postolache", + "Valentino Maiorca", + "Michele Mancusi", + "Riccardo Marin", + "Emanuele Rodola" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.689", + "point2d": [ + -58.49405288696289, + -7.38716983795166 + ], + "cluster": 21.0 + }, + { + "idx": 691, + "title": "Hard Sample Aware Prompt-Tuning", + "abstract": "Prompt-tuning based few-shot learning has garnered increasing attention in recent years due to its efficiency and promising capability. To achieve the best performance for NLP tasks with just a few samples, it is vital to include as many informative samples as possible and to avoid misleading ones. However, there is no work in the prompt-tuning literature addressing the problem of differentiating informative hard samples from misleading ones in model training, which is challenging due to the lack of supervision signals about the quality of the samples to train a well-performing model. We propose a Hard Sample Aware Prompt-Tuning framework (i.e. HardPT) to solve the non-differentiable problem in hard sample identification with reinforcement learning, and to strengthen the discrimination of the feature space without changing the original data distribution via an adaptive contrastive learning method. An extensive empirical study on a series of NLP tasks demonstrates the capability of HardPT in few-shot scenarios. HardPT obtains new SOTA results on all evaluated NLP tasks, including pushing the SST-5 accuracy to 49.5% (1.1% point absolute improvement), QNLI accuracy to 74.6% (1.9% absolute improvement), NMLI accuracy to 71.5 (0.7% absolute improvement), TACREV F_1-score to 28.2 (1.0 absolute improvement), and i2b2/VA F_1-score to 41.2 (1.3 absolute improvement).", + "authors": [ + "Yuanjian Xu", + "Qi An", + "Jiahuan Zhang", + "Peng Li", + "Zaiqing Nie" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.690", + "point2d": [ + -16.773263931274414, + -5.466028690338135 + ], + "cluster": 3.0 + }, + { + "idx": 692, + "title": "WikiBio: a Semantic Resource for the Intersectional Analysis of Biographical Events", + "abstract": "Biographical event detection is a relevant task that allows for the exploration and comparison of the ways in which people\u2019s lives are told and represented. This may support several real-life applications in digital humanities and in works aimed at exploring bias about minoritized groups. Despite that, there are no corpora and models specifically designed for this task. In this paper we fill this gap by presenting a new corpus annotated for biographical event detection. The corpus, which includes 20 Wikipedia biographies, was aligned with 5 existing corpora in order to train a model for the biographical event detection task. The model was able to detect all mentions of the target-entity in a biography with an F-score of 0.808 and the entity-related events with an F-score of 0.859.
Finally, the model was used to perform an analysis of biases about women and non-Western people in Wikipedia biographies.", + "authors": [ + "Marco Antonio Stranisci", + "Rossana Damiano", + "Enrico Mensa", + "Viviana Patti", + "Daniele Radicioni", + "Tommaso Caselli" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.691", + "point2d": [ + 35.19675827026367, + 31.71709632873535 + ], + "cluster": 19.0 + }, + { + "idx": 693, + "title": "Best-k Search Algorithm for Neural Text Generation", + "abstract": "Modern natural language generation paradigms require a decoding strategy to obtain quality sequences out of the model. Beam search yields high-quality but low-diversity outputs; stochastic approaches suffer from high variance and sometimes low quality. In this work, we propose a deterministic search algorithm balancing both quality and diversity. We first investigate the vanilla best-first search (BFS) algorithm and then propose the best-k search algorithm. Inspired by BFS, we greedily expand the top k nodes, instead of the first node, to boost efficiency and diversity. Upweighting recently discovered nodes accompanied by heap pruning ensures the completeness of the search procedure. Experiments on four NLG tasks show that best-k search yields more diverse and natural outputs compared to strong baselines, while maintaining high text quality. The proposed algorithm is parameter-free, lightweight, efficient, and easy to use.", + "authors": [ + "Jiacheng Xu", + "Caiming Xiong", + "Silvio Savarese", + "Yingbo Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.692", + "point2d": [ + -28.726593017578125, + 15.130454063415527 + ], + "cluster": 4.0 + }, + { + "idx": 694, + "title": "Towards Leaving No Indic Language Behind: Building Monolingual Corpora, Benchmark and Models for Indic Languages", + "abstract": "Building Natural Language Understanding (NLU) capabilities for Indic languages, which have a collective speaker base of more than one billion, is absolutely crucial. In this work, we aim to improve the NLU capabilities of Indic languages by making contributions along 3 important axes: (i) monolingual corpora, (ii) NLU test sets, and (iii) multilingual LLMs focusing on Indic languages. Specifically, we curate the largest monolingual corpora, IndicCorp, with 20.9B tokens covering 24 languages from 4 language families - a 2.3x increase over prior work, while supporting 12 additional languages. Next, we create a human-supervised benchmark, IndicXTREME, consisting of nine diverse NLU tasks covering 20 languages. Across languages and tasks, IndicXTREME contains a total of 105 evaluation sets, of which 52 are new contributions to the literature. To the best of our knowledge, this is the first effort towards creating a standard benchmark for Indic languages that aims to test the multilingual zero-shot capabilities of pretrained language models. Finally, we train IndicBERT v2, a state-of-the-art model supporting all the languages. Averaged across languages and tasks, the model achieves an absolute improvement of 2 points over a strong baseline. The data and models are available at https://github.com/AI4Bharat/IndicBERT.", + "authors": [ + "Sumanth Doddapaneni", + "Rahul Aralikatte", + "Gowtham Ramesh", + "Shreya Goyal", + "Mitesh M. 
Khapra", + "Anoop Kunchukuttan", + "Pratyush Kumar" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.693", + "point2d": [ + -51.553531646728516, + -7.560595989227295 + ], + "cluster": 46.0 + }, + { + "idx": 695, + "title": "Transforming Visual Scene Graphs to Image Captions", + "abstract": "We propose to TransForm Scene Graphs into more descriptive Captions (TFSGC). In TFSGC, we apply multi-head attention (MHA) to design the Graph Neural Network (GNN) for embedding scene graphs. After embedding, different graph embeddings contain diverse specific knowledge for generating the words with different part-of-speech, e.g., object/attribute embedding is good for generating nouns/adjectives. Motivated by this, we design a Mixture-of-Expert (MOE)-based decoder, where each expert is built on MHA, for discriminating the graph embeddings to generate different kinds of words. Since both the encoder and decoder are built based on the MHA, as a result, we construct a simple and homogeneous encoder-decoder unlike the previous heterogeneous ones which usually apply Fully-Connected-based GNN and LSTM-based decoder. The homogeneous architecture enables us to unify the training configuration of the whole model instead of specifying different training strategies for diverse sub-networks as in the heterogeneous pipeline, which releases the training difficulty. Extensive experiments on the MS-COCO captioning benchmark validate the effectiveness of our TFSGC. The code is in: https://anonymous.4open.science/r/ACL23_TFSGC.", + "authors": [ + "Xu Yang", + "Jiawei Peng", + "Zihua Wang", + "Haiyang Xu", + "Qinghao Ye", + "Chenliang Li", + "Songfang Huang", + "Fei Huang", + "Zhangzikang Li", + "Yu Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.694", + "point2d": [ + -54.426475524902344, + 45.74577713012695 + ], + "cluster": 43.0 + }, + { + "idx": 696, + "title": "Hybrid Transducer and Attention based Encoder-Decoder Modeling for Speech-to-Text Tasks", + "abstract": "Transducer and Attention based Encoder-Decoder (AED) are two widely used frameworks for speech-to-text tasks. They are designed for different purposes and each has its own benefits and drawbacks for speech-to-text tasks. In order to leverage strengths of both modeling methods, we propose a solution by combining Transducer and Attention based Encoder-Decoder (TAED) for speech-to-text tasks. The new method leverages AED\u2019s strength in non-monotonic sequence to sequence learning while retaining Transducer\u2019s streaming property. In the proposed framework, Transducer and AED share the same speech encoder. The predictor in Transducer is replaced by the decoder in the AED model, and the outputs of the decoder are conditioned on the speech inputs instead of outputs from an unconditioned language model. The proposed solution ensures that the model is optimized by covering all possible read/write scenarios and creates a matched environment for streaming applications. We evaluate the proposed approach on the MuST-C dataset and the findings demonstrate that TAED performs significantly better than Transducer for offline automatic speech recognition (ASR) and speech-to-text translation (ST) tasks. 
In the streaming case, TAED outperforms Transducer in the ASR task and one ST direction, while comparable results are achieved in the other translation direction.", + "authors": [ + "Yun Tang", + "Anna Sun", + "Hirofumi Inaguma", + "Xinyue Chen", + "Ning Dong", + "Xutai Ma", + "Paden Tomasello", + "Juan Pino" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.695", + "point2d": [ + -64.28399658203125, + 17.10199546813965 + ], + "cluster": 30.0 + }, + { + "idx": 697, + "title": "Improving Domain Generalization for Prompt-Aware Essay Scoring via Disentangled Representation Learning", + "abstract": "Automated Essay Scoring (AES) aims to score essays written in response to specific prompts. Many AES models have been proposed, but most of them are either prompt-specific or prompt-adaptive and cannot generalize well on \u201cunseen\u201d prompts. This work focuses on improving the generalization ability of AES models from the perspective of domain generalization, where the data of target prompts cannot be accessed during training. Specifically, we propose a prompt-aware neural AES model to extract comprehensive representations for essay scoring, including both prompt-invariant and prompt-specific features. To improve the generalization of the representations, we further propose a novel disentangled representation learning framework. In this framework, a contrastive norm-angular alignment strategy and a counterfactual self-training strategy are designed to disentangle the prompt-invariant information and prompt-specific information in the representation. Extensive experimental results on datasets of both ASAP and TOEFL11 demonstrate the effectiveness of our method under the domain generalization setting.", + "authors": [ + "Zhiwei Jiang", + "Tianyi Gao", + "Yafeng Yin", + "Meng Liu", + "Hua Yu", + "Zifeng Cheng", + "Qing Gu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.696", + "point2d": [ + -13.987245559692383, + 24.978893280029297 + ], + "cluster": 3.0 + }, + { + "idx": 698, + "title": "What\u2019s the Meaning of Superhuman Performance in Today\u2019s NLU?", + "abstract": "In the last five years, there has been a significant focus in Natural Language Processing (NLP) on developing larger Pretrained Language Models (PLMs) and introducing benchmarks such as SuperGLUE and SQuAD to measure their abilities in language understanding, reasoning, and reading comprehension. These PLMs have achieved impressive results on these benchmarks, even surpassing human performance in some cases. This has led to claims of superhuman capabilities and the provocative idea that certain tasks have been solved. In this position paper, we take a critical look at these claims and ask whether PLMs truly have superhuman abilities and what the current benchmarks are really evaluating. 
We show that these benchmarks have serious limitations affecting the comparison between humans and PLMs and provide recommendations for fairer and more transparent benchmarks.", + "authors": [ + "Simone Tedeschi", + "Johan Bos", + "Thierry Declerck", + "Jan Haji\u010d", + "Daniel Hershcovich", + "Eduard Hovy", + "Alexander Koller", + "Simon Krek", + "Steven Schockaert", + "Rico Sennrich", + "Ekaterina Shutova", + "Roberto Navigli" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.697", + "point2d": [ + -25.830537796020508, + -35.378787994384766 + ], + "cluster": 36.0 + }, + { + "idx": 699, + "title": "PromptNER: Prompt Locating and Typing for Named Entity Recognition", + "abstract": "Prompt learning is a new paradigm for utilizing pre-trained language models and has achieved great success in many tasks. To adopt prompt learning in the NER task, two kinds of methods have been explored from a pair of symmetric perspectives: populating the template by enumerating spans to predict their entity types, or constructing type-specific prompts to locate entities. However, these methods not only require multi-round prompting with high time overhead and computational cost, but also require elaborate prompt templates that are difficult to apply in practical scenarios. In this paper, we unify entity locating and entity typing into prompt learning, and design a dual-slot multi-prompt template with the position slot and type slot to prompt locating and typing, respectively. Multiple prompts can be input to the model simultaneously, and then the model extracts all entities by parallel predictions on the slots. To assign labels for the slots during training, we design a dynamic template filling mechanism that uses the extended bipartite graph matching between prompts and the ground-truth entities. We conduct experiments in various settings, including resource-rich flat and nested NER datasets and low-resource in-domain and cross-domain datasets. Experimental results show that the proposed model achieves a significant performance improvement, especially in the cross-domain few-shot setting, where it outperforms the state-of-the-art model by +7.7% on average.", + "authors": [ + "Yongliang Shen", + "Zeqi Tan", + "Shuhui Wu", + "Wenqi Zhang", + "Rongsheng Zhang", + "Yadong Xi", + "Weiming Lu", + "Yueting Zhuang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.698", + "point2d": [ + 33.04985809326172, + -84.38736724853516 + ], + "cluster": 14.0 + }, + { + "idx": 700, + "title": "Hints on the data for language modeling of synthetic languages with transformers", + "abstract": "Language Models (LMs) are becoming more and more useful for providing representations upon which to train Natural Language Processing applications. However, there is now clear evidence that attention-based transformers require a critical amount of language data to produce good enough LMs. The question we have addressed in this paper is to what extent the critical amount of data varies for languages of different morphological typology, in particular those that have a rich inflectional morphology, and whether the tokenization method to preprocess the data can make a difference. These details can be important for low-resourced languages that need to plan the production of datasets. 
We evaluated, intrinsically and extrinsically, the differences among five languages with different pretraining dataset sizes and three different tokenization methods for each. The results confirm that the size of the vocabulary due to morphological characteristics is directly correlated with both the LM perplexity and the performance of two typical downstream tasks, namely named entity recognition and POS labeling. The experiments also provide new evidence that a canonical tokenizer can reduce perplexity by more than half for a polysynthetic language like Quechua, as well as raise F1 from 0.8 to more than 0.9 in both downstream tasks with an LM trained on only 6M tokens.", + "authors": [ + "Rodolfo Zevallos", + "Nuria Bel" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.699", + "point2d": [ + -36.15933609008789, + -38.180599212646484 + ], + "cluster": 46.0 + }, + { + "idx": 701, + "title": "Neural Machine Translation Methods for Translating Text to Sign Language Glosses", + "abstract": "State-of-the-art techniques common to low-resource Machine Translation (MT) are applied to improve MT of spoken language text to Sign Language (SL) glosses. In our experiments, we improve the performance of the transformer-based models via (1) data augmentation, (2) semi-supervised Neural Machine Translation (NMT), (3) transfer learning, and (4) multilingual NMT. The proposed methods are implemented progressively on two German SL corpora containing gloss annotations. Multilingual NMT combined with data augmentation appears to be the most successful setting, yielding statistically significant improvements as measured by three automatic metrics (up to over 6 points BLEU), and confirmed via human evaluation. Our best setting outperforms all previous work that reports on the same test set and is also confirmed on a corpus of American Sign Language (ASL).", + "authors": [ + "Dele Zhu", + "Vera Czehmann", + "Eleftherios Avramidis" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.700", + "point2d": [ + -80.94120788574219, + 21.041622161865234 + ], + "cluster": 37.0 + }, + { + "idx": 702, + "title": "Revisiting Event Argument Extraction: Can EAE Models Learn Better When Being Aware of Event Co-occurrences?", + "abstract": "Event co-occurrences have been proved effective for event extraction (EE) in previous studies, but have not been considered for event argument extraction (EAE) recently. In this paper, we try to fill this gap between EE research and EAE research, by highlighting the question \u201cCan EAE models learn better when being aware of event co-occurrences?\u201d. To answer this question, we reformulate EAE as a problem of table generation and extend a SOTA prompt-based EAE model into a non-autoregressive generation framework, called TabEAE, which is able to extract the arguments of multiple events in parallel. Under this framework, we experiment with 3 different training-inference schemes on 4 datasets (ACE05, RAMS, WikiEvents and MLEE) and discover that via training the model to extract all events in parallel, it can better distinguish the semantic boundary of each event and its ability to extract single events gets substantially improved. Experimental results show that our method achieves new state-of-the-art performance on the 4 datasets. 
Our code is available at https://github.com/Stardust-hyx/TabEAE.", + "authors": [ + "Yuxin He", + "Jingyue Hu", + "Buzhou Tang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.701", + "point2d": [ + 44.38982009887695, + -45.27417755126953 + ], + "cluster": 28.0 + }, + { + "idx": 703, + "title": "HAUSER: Towards Holistic and Automatic Evaluation of Simile Generation", + "abstract": "Similes play an important role in creative writing such as story and dialogue generation. Proper evaluation metrics are like a beacon guiding the research of simile generation (SG). However, it remains under-explored what criteria should be considered, how to quantify each criterion into metrics, and whether the metrics are effective for comprehensive, efficient, and reliable SG evaluation. To address these issues, we establish HAUSER, a holistic and automatic evaluation system for the SG task, which consists of five criteria from three perspectives and automatic metrics for each criterion. Through extensive experiments, we verify that our metrics are significantly more correlated with human ratings from each perspective compared with prior automatic metrics. Resources of HAUSER are publicly available at https://github.com/Abbey4799/HAUSER.", + "authors": [ + "Qianyu He", + "Yikai Zhang", + "Jiaqing Liang", + "Yuncheng Huang", + "Yanghua Xiao", + "Yunwen Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.702", + "point2d": [ + -26.61119842529297, + 42.799232482910156 + ], + "cluster": 35.0 + }, + { + "idx": 704, + "title": "Large-scale Lifelong Learning of In-context Instructions and How to Tackle It", + "abstract": "Jointly fine-tuning a Pre-trained Language Model (PLM) on a pre-defined set of tasks with in-context instructions has been proven to improve its generalization performance, allowing us to build a universal language model that can be deployed across task boundaries. In this work, we explore for the first time whether this attractive property of in-context instruction learning can be extended to a scenario in which tasks are fed to the target PLM in a sequential manner. The primary objective of so-called lifelong in-context instruction learning is to improve the target PLM\u2019s instance- and task-level generalization performance as it observes more tasks. DynaInst, the proposed method for lifelong in-context instruction learning, achieves noticeable improvements in both types of generalization, nearly reaching the upper bound performance obtained through joint training.", + "authors": [ + "Jisoo Mok", + "Jaeyoung Do", + "Sungjin Lee", + "Tara Taghavi", + "Seunghak Yu", + "Sungroh Yoon" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.703", + "point2d": [ + -16.5307674407959, + -19.85917854309082 + ], + "cluster": 3.0 + }, + { + "idx": 705, + "title": "Controllable Text Generation via Probability Density Estimation in the Latent Space", + "abstract": "Previous work on controllable text generation has explored the idea of control from the latent space, such as optimizing a representation with attribute-specific classifiers or sampling one from relevant discrete samples. However, these approaches cannot effectively model a complex space with diverse attributes, high dimensionality, and asymmetric structure, leaving subsequent controls unsatisfying. 
In this work, we propose a novel control framework using probability density estimation in the latent space. Our method utilizes an invertible transformation function, the Normalizing Flow, that maps the complex distributions in the latent space to simple Gaussian distributions in the prior space. Thus, we can perform sophisticated and flexible controls in the prior space and feed the control effects back into the latent space owing to the bijection property of invertible transformations. Experiments on single-attribute and multi-attribute control reveal that our method outperforms several strong baselines on attribute relevance and text quality, achieving a new SOTA. Further analysis of control strength adjustment demonstrates the flexibility of our control strategy.", + "authors": [ + "Yuxuan Gu", + "Xiaocheng Feng", + "Sicheng Ma", + "Lingyuan Zhang", + "Heng Gong", + "Weihong Zhong", + "Bing Qin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.704", + "point2d": [ + -22.281972885131836, + 6.9023213386535645 + ], + "cluster": 4.0 + }, + { + "idx": 706, + "title": "Learning Latent Relations for Temporal Knowledge Graph Reasoning", + "abstract": "Temporal Knowledge Graph (TKG) reasoning aims to predict future facts based on historical data. However, due to the limitations in construction tools and data sources, many important associations between entities may be omitted in TKGs. We refer to these missing associations as latent relations. Most existing methods have some drawbacks in explicitly capturing intra-time latent relations between co-occurring entities and inter-time latent relations between entities that appear at different times. To tackle these problems, we propose a novel Latent relations Learning method for TKG reasoning, namely L2TKG. Specifically, we first utilize a Structural Encoder (SE) to obtain representations of entities at each timestamp. We then design a Latent Relations Learning (LRL) module to mine and exploit the intra- and inter-time latent relations. Finally, we extract the temporal representations from the output of SE and LRL for entity prediction. Extensive experiments on four datasets demonstrate the effectiveness of L2TKG.", + "authors": [ + "Mengqi Zhang", + "Yuwei Xia", + "Qiang Liu", + "Shu Wu", + "Liang Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.705", + "point2d": [ + 58.84624481201172, + -61.499351501464844 + ], + "cluster": 22.0 + }, + { + "idx": 707, + "title": "DT-Solver: Automated Theorem Proving with Dynamic-Tree Sampling Guided by Proof-level Value Function", + "abstract": "Recent advances in neural theorem-proving resort to large language models and tree searches. When proving a theorem, a language model advises single-step actions based on the current proving state, and the tree search finds a sequence of correct steps using actions given by the language model. However, prior works often expend a constant computation effort on each proving state, ignoring that hard states often need more exploration than easy states. Moreover, they evaluate and guide the proof search solely depending on the current proof state instead of considering the whole proof trajectory as human reasoning does. Here, to accommodate general theorems, we propose a novel Dynamic-Tree Driven Theorem Solver (DT-Solver) that guides the search procedure with state confidence and proof-level values. 
Specifically, DT-Solver introduces a dynamic-tree Monte-Carlo search algorithm, which dynamically allocates computing budgets for different state confidences, guided by a new proof-level value function to discover proof states that require substantial exploration. Experiments on two popular theorem-proving datasets, PISA and Mathlib, show significant performance gains by our DT-Solver over the state-of-the-art approaches, with a 6.65% average improvement in success rate, and especially large gains under low computing-resource settings (an 11.03% average improvement).", + "authors": [ + "Haiming Wang", + "Ye Yuan", + "Zhengying Liu", + "Jianhao Shen", + "Yichun Yin", + "Jing Xiong", + "Enze Xie", + "Han Shi", + "Yujun Li", + "Lin Li", + "Jian Yin", + "Zhenguo Li", + "Xiaodan Liang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.706", + "point2d": [ + 50.63492202758789, + -18.219484329223633 + ], + "cluster": 12.0 + }, + { + "idx": 708, + "title": "Unsupervised Selective Rationalization with Noise Injection", + "abstract": "A major issue with using deep learning models in sensitive applications is that they provide no explanation for their output. To address this problem, unsupervised selective rationalization produces rationales alongside predictions by chaining two jointly-trained components, a rationale generator and a predictor. Although this architecture guarantees that the prediction relies solely on the rationale, it does not ensure that the rationale contains a plausible explanation for the prediction. We introduce a novel training technique that effectively limits generation of implausible rationales by injecting noise between the generator and the predictor. Furthermore, we propose a new benchmark for evaluating unsupervised selective rationalization models using movie reviews from existing datasets. We achieve sizeable improvements in rationale plausibility and task accuracy over the state-of-the-art across a variety of tasks, including our new benchmark, while maintaining or improving model faithfulness.", + "authors": [ + "Adam Storek", + "Melanie Subbiah", + "Kathleen McKeown" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.707", + "point2d": [ + 28.918750762939453, + -6.0342793464660645 + ], + "cluster": 31.0 + }, + { + "idx": 709, + "title": "Understanding In-Context Learning via Supportive Pretraining Data", + "abstract": "In-context learning (ICL) improves language models\u2019 performance on a variety of NLP tasks by simply demonstrating a handful of examples at inference time. It is not well understood why ICL ability emerges, as the model has never been specifically trained on such demonstrations. Unlike prior work that explores implicit mechanisms behind ICL, we study ICL via investigating the pretraining data. Specifically, we first adapt an iterative, gradient-based approach to find a small subset of pretraining data that supports ICL. We observe that a continued pretraining on this small subset significantly improves the model\u2019s ICL ability, by up to 18%. We then compare the supportive subset contrastively with random subsets of pretraining data and discover: (1) The supportive pretraining data to ICL do not have a higher domain relevance to downstream tasks. (2) The supportive pretraining data have a higher mass of rarely occurring, long-tail tokens. 
(3) The supportive pretraining data are challenging examples where the information gain from long-range context is below average, indicating that learning to incorporate difficult long-range context encourages ICL. Our work takes a first step towards understanding ICL via analyzing instance-level pretraining data. Our insights have the potential to enhance the ICL ability of language models by actively guiding the construction of pretraining data in the future.", + "authors": [ + "Xiaochuang Han", + "Daniel Simig", + "Todor Mihaylov", + "Yulia Tsvetkov", + "Asli Celikyilmaz", + "Tianlu Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.708", + "point2d": [ + -14.478513717651367, + -22.236835479736328 + ], + "cluster": 3.0 + }, + { + "idx": 710, + "title": "ETHICIST: Targeted Training Data Extraction Through Loss Smoothed Soft Prompting and Calibrated Confidence Estimation", + "abstract": "Large pre-trained language models achieve impressive results across many tasks. However, recent works point out that pre-trained language models may memorize a considerable fraction of their training data, leading to the privacy risk of information leakage. In this paper, we propose a method named Ethicist for targeted training data extraction through loss smoothed soft prompting and calibrated confidence estimation, investigating how to recover the suffix in the training data when given a prefix. To elicit memorization in the attacked model, we tune soft prompt embeddings while keeping the model fixed. We further propose a smoothing loss that smooths the loss distribution of the suffix tokens to make it easier to sample the correct suffix. In order to select the most probable suffix from a collection of sampled suffixes and estimate the prediction confidence, we propose a calibrated confidence estimation method, which normalizes the confidence of the generated suffixes with a local estimation. We show that Ethicist significantly improves the extraction performance on a recently proposed public benchmark. We also investigate several factors influencing the data extraction performance, including decoding strategy, model scale, prefix length, and suffix length. Our code is available at https://github.com/thu-coai/Targeted-Data-Extraction.", + "authors": [ + "Zhexin Zhang", + "Jiaxin Wen", + "Minlie Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.709", + "point2d": [ + -2.409905195236206, + 12.04038143157959 + ], + "cluster": 48.0 + }, + { + "idx": 711, + "title": "Effective Contrastive Weighting for Dense Query Expansion", + "abstract": "Verbatim queries submitted to search engines often do not sufficiently describe the user\u2019s search intent. Pseudo-relevance feedback (PRF) techniques, which modify a query\u2019s representation using the top-ranked documents, have been shown to overcome such inadequacies and improve retrieval effectiveness for both lexical methods (e.g., BM25) and dense methods (e.g., ANCE, ColBERT). For instance, the recent ColBERT-PRF approach heuristically chooses new embeddings to add to the query representation using the inverse document frequency (IDF) of the underlying tokens. However, this heuristic potentially ignores the valuable context encoded by the embeddings. In this work, we present a contrastive solution that learns to select the most useful embeddings for expansion. 
More specifically, a deep language model-based contrastive weighting model, called CWPRF, is trained to learn to discriminate between relevant and non-relevant documents for semantic search. Our experimental results show that our contrastive weighting model can help select useful expansion embeddings and outperforms various baselines. In particular, CWPRF can improve nDCG@10 by up to 4.1% compared to an existing PRF approach for ColBERT while maintaining its efficiency.", + "authors": [ + "Xiao Wang", + "Sean MacAvaney", + "Craig Macdonald", + "Iadh Ounis" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.710", + "point2d": [ + 13.73311710357666, + -17.90531349182129 + ], + "cluster": 18.0 + }, + { + "idx": 712, + "title": "Improving the Detection of Multilingual Online Attacks with Rich Social Media Data from Singapore", + "abstract": "Toxic content is a global problem, but most resources for detecting toxic content are in English. When datasets are created in other languages, they often focus exclusively on one language or dialect. In many cultural and geographical settings, however, it is common to code-mix languages, combining and interchanging them throughout conversations. To shine a light on this practice, and enable more research into code-mixed toxic content, we introduce SOA, a new multilingual dataset of online attacks. Using the multilingual city-state of Singapore as a starting point, we collect a large corpus of Reddit comments in Indonesian, Malay, Singlish, and other languages, and provide fine-grained hierarchical labels for online attacks. We publish the corpus with rich metadata, as well as additional unlabelled data for domain adaptation. We share comprehensive baseline results, show how the metadata can be used for granular error analysis, and demonstrate the benefits of domain adaptation for detecting multilingual online attacks.", + "authors": [ + "Janosch Haber", + "Bertie Vidgen", + "Matthew Chapman", + "Vibhor Agarwal", + "Roy Ka-Wei Lee", + "Yong Keong Yap", + "Paul R\u00f6ttger" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.711", + "point2d": [ + 30.85982894897461, + 22.757442474365234 + ], + "cluster": 15.0 + }, + { + "idx": 713, + "title": "Reanalyzing L2 Preposition Learning with Bayesian Mixed Effects and a Pretrained Language Model", + "abstract": "We use both Bayesian and neural models to dissect a data set of Chinese learners\u2019 pre- and post-interventional responses to two tests measuring their understanding of English prepositions. The results mostly replicate previous findings from frequentist analyses and newly reveal crucial interactions between student ability, task type, and stimulus sentence. 
Given the sparsity of the data as well as the high diversity among learners, the Bayesian method proves most useful, but we also see potential in using language model probabilities as predictors of grammaticality and learnability.", + "authors": [ + "Jakob Prange", + "Man Ho Ivy Wong" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.712", + "point2d": [ + -24.620635986328125, + -43.253013610839844 + ], + "cluster": 6.0 + }, + { + "idx": 714, + "title": "Socratic Pretraining: Question-Driven Pretraining for Controllable Summarization", + "abstract": "In long document controllable summarization, where labeled data is scarce, pretrained models struggle to adapt to the task and effectively respond to user queries. In this paper, we introduce Socratic pretraining, a question-driven, unsupervised pretraining objective specifically designed to improve controllability in summarization tasks. By training a model to generate and answer relevant questions in a given context, Socratic pretraining enables the model to more effectively adhere to user-provided queries and identify relevant content to be summarized. We demonstrate the effectiveness of this approach through extensive experimentation on two summarization domains, short stories and dialogue, and multiple control strategies: keywords, questions, and factoid QA pairs. Our pretraining method relies only on unlabeled documents and a question generation system and outperforms pre-finetuning approaches that use additional supervised data. Furthermore, our results show that Socratic pretraining cuts task-specific labeled data requirements in half, is more faithful to user-provided queries, and achieves state-of-the-art performance on QMSum and SQuALITY.", + "authors": [ + "Artidoro Pagnoni", + "Alex Fabbri", + "Wojciech Kryscinski", + "Chien-Sheng Wu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.713", + "point2d": [ + 56.235877990722656, + 16.08646011352539 + ], + "cluster": 7.0 + }, + { + "idx": 715, + "title": "MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering", + "abstract": "Visual language data such as plots, charts, and infographics are ubiquitous in the human world. However, state-of-the-art vision-language models do not perform well on these data. We propose MatCha (Math reasoning and Chart derendering pretraining) to enhance visual language models\u2019 capabilities in jointly modeling charts/plots and language data. Specifically, we propose several pretraining tasks that cover plot deconstruction and numerical reasoning, which are the key capabilities in visual language modeling. We perform the MatCha pretraining starting from Pix2Struct, a recently proposed image-to-text visual language model. On standard benchmarks such as PlotQA and ChartQA, the MatCha model outperforms state-of-the-art methods by nearly 20%. 
We also examine how well MatCha pretraining transfers to domains such as screenshots, textbook diagrams, and document figures, and observe overall improvement, verifying the usefulness of MatCha pretraining on broader visual language tasks.", + "authors": [ + "Fangyu Liu", + "Francesco Piccinno", + "Syrine Krichene", + "Chenxi Pang", + "Kenton Lee", + "Mandar Joshi", + "Yasemin Altun", + "Nigel Collier", + "Julian Eisenschlos" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.714", + "point2d": [ + -49.937225341796875, + 49.0928840637207 + ], + "cluster": 43.0 + }, + { + "idx": 716, + "title": "MGR: Multi-generator Based Rationalization", + "abstract": "Rationalization employs a generator and a predictor to construct a self-explaining NLP model in which the generator selects a subset of human-intelligible pieces of the input text to pass to the following predictor. However, rationalization suffers from two key challenges, i.e., spurious correlation and degeneration, where the predictor overfits the spurious or meaningless pieces solely selected by the not-yet well-trained generator and in turn degrades the generator. Although many studies have been proposed to address the two challenges, they are usually designed separately and do not take both of them into account. In this paper, we propose a simple yet effective method named MGR to simultaneously solve the two problems. The key idea of MGR is to employ multiple generators such that the occurrence stability of real pieces is improved and more meaningful pieces are delivered to the predictor. Empirically, we show that MGR improves the F1 score by up to 20.9% as compared to state-of-the-art methods.", + "authors": [ + "Wei Liu", + "Haozhao Wang", + "Jun Wang", + "Ruixuan Li", + "Xinyang Li", + "YuanKai Zhang", + "Yang Qiu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.715", + "point2d": [ + -22.558324813842773, + 18.173561096191406 + ], + "cluster": 4.0 + }, + { + "idx": 717, + "title": "BUMP: A Benchmark of Unfaithful Minimal Pairs for Meta-Evaluation of Faithfulness Metrics", + "abstract": "The proliferation of automatic faithfulness metrics for summarization has produced a need for benchmarks to evaluate them. While existing benchmarks measure the correlation with human judgements of faithfulness on model-generated summaries, they are insufficient for diagnosing whether metrics are: 1) consistent, i.e., indicate lower faithfulness as errors are introduced into a summary, 2) effective on human-written texts, and 3) sensitive to different error types (as summaries can contain multiple errors). To address these needs, we present a benchmark of unfaithful minimal pairs (BUMP), a dataset of 889 human-written, minimally different summary pairs, where a single error is introduced to a summary from the CNN/DailyMail dataset to produce an unfaithful summary. 
We find BUMP complements existing benchmarks in a number of ways: 1) the summaries in BUMP are harder to discriminate and less probable under SOTA summarization models, 2) unlike non-pair-based datasets, BUMP can be used to measure the consistency of metrics, and reveals that the most discriminative metrics tend not to be the most consistent, and 3) unlike datasets containing generated summaries with multiple errors, BUMP enables the measurement of metrics\u2019 performance on individual error types.", + "authors": [ + "Liang Ma", + "Shuyang Cao", + "Robert L Logan IV", + "Di Lu", + "Shihao Ran", + "Ke Zhang", + "Joel Tetreault", + "Alejandro Jaimes" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.716", + "point2d": [ + -6.770626544952393, + 45.570716857910156 + ], + "cluster": 47.0 + }, + { + "idx": 718, + "title": "Is Fine-tuning Needed? Pre-trained Language Models Are Near Perfect for Out-of-Domain Detection", + "abstract": "Out-of-distribution (OOD) detection is a critical task for reliable predictions over text. Fine-tuning with pre-trained language models has been a de facto procedure to derive OOD detectors with respect to in-distribution (ID) data. Despite its common use, the understanding of the role of fine-tuning and its necessity for OOD detection is largely unexplored. In this paper, we raise the question: is fine-tuning necessary for OOD detection? We present a study investigating the efficacy of directly leveraging pre-trained language models for OOD detection, without any model fine-tuning on the ID data. We compare the approach with several competitive fine-tuning objectives, and offer new insights under various types of distributional shifts. Extensive experiments demonstrate near-perfect OOD detection performance (with 0% FPR95 in many cases), strongly outperforming the fine-tuned counterpart.", + "authors": [ + "Rheeya Uppaal", + "Junjie Hu", + "Yixuan Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.717", + "point2d": [ + -1.9348564147949219, + -6.223448276519775 + ], + "cluster": 17.0 + }, + { + "idx": 719, + "title": "UniSumm and SummZoo: Unified Model and Diverse Benchmark for Few-Shot Summarization", + "abstract": "The high annotation costs and diverse demands of various summarization tasks motivate the development of few-shot summarization. However, despite the emergence of many summarization tasks and datasets, the current training paradigm for few-shot summarization systems ignores potentially shareable knowledge in heterogeneous datasets. To this end, we propose UniSumm, a unified few-shot summarization model pre-trained with multiple summarization tasks that can be prefix-tuned to excel at any few-shot summarization task. Meanwhile, to better evaluate few-shot summarizers, under the principles of diversity and robustness, we assemble and release a new benchmark, SummZoo. 
It consists of 8 summarization tasks with multiple sets of few-shot samples for each task, covering diverse domains. Experimental results and analysis show that UniSumm outperforms strong baselines by a large margin across all sub-tasks in SummZoo under both automatic and human evaluations, and achieves results comparable to a GPT-3.5 model in human evaluation.", + "authors": [ + "Yulong Chen", + "Yang Liu", + "Ruochen Xu", + "Ziyi Yang", + "Chenguang Zhu", + "Michael Zeng", + "Yue Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.718", + "point2d": [ + -9.283943176269531, + 41.17637634277344 + ], + "cluster": 7.0 + }, + { + "idx": 720, + "title": "RADE: Reference-Assisted Dialogue Evaluation for Open-Domain Dialogue", + "abstract": "Evaluating open-domain dialogue systems is challenging for reasons such as the one-to-many problem, i.e., many appropriate responses other than just the golden response. As of now, automatic evaluation methods need better consistency with humans, while reliable human evaluation can be time- and cost-intensive. To this end, we propose the Reference-Assisted Dialogue Evaluation (RADE) approach under the multi-task learning framework, which leverages the pre-created utterance as a reference, in addition to the gold response, to relieve the one-to-many problem. Specifically, RADE explicitly compares the reference and the candidate response to predict their overall scores. Moreover, an auxiliary response generation task enhances prediction via a shared encoder. To support RADE, we extend three datasets with additional rated responses other than just a golden response by human annotation. Experiments on our three datasets and two existing benchmarks demonstrate the effectiveness of our method, where Pearson, Spearman, and Kendall correlations with human evaluation outperform state-of-the-art baselines.", + "authors": [ + "Zhengliang Shi", + "Weiwei Sun", + "Shuo Zhang", + "Zhen Zhang", + "Pengjie Ren", + "Zhaochun Ren" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.719", + "point2d": [ + 13.82765007019043, + 68.30233001708984 + ], + "cluster": 49.0 + }, + { + "idx": 721, + "title": "An AMR-based Link Prediction Approach for Document-level Event Argument Extraction", + "abstract": "Recent works have introduced Abstract Meaning Representation (AMR) for Document-level Event Argument Extraction (Doc-level EAE), since AMR provides a useful interpretation of complex semantic structures and helps to capture long-distance dependencies. However, in these works AMR is used only implicitly, for instance, as additional features or training signals. Motivated by the fact that all event structures can be inferred from AMR, this work reformulates EAE as a link prediction problem on AMR graphs. Since AMR is a generic structure and does not perfectly suit EAE, we propose a novel graph structure, Tailored AMR Graph (TAG), which compresses less informative subgraphs and edge types, integrates span information, and highlights surrounding events in the same document. With TAG, we further propose a novel method using graph neural networks as a link prediction model to find event arguments. 
Our extensive experiments on WikiEvents and RAMS show that this simpler approach outperforms the state-of-the-art models by 3.63pt and 2.33pt F1, respectively, and does so with 56% less inference time.", + "authors": [ + "Yuqing Yang", + "Qipeng Guo", + "Xiangkun Hu", + "Yue Zhang", + "Xipeng Qiu", + "Zheng Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.720", + "point2d": [ + 45.08207702636719, + -48.150184631347656 + ], + "cluster": 28.0 + }, + { + "idx": 722, + "title": "PuMer: Pruning and Merging Tokens for Efficient Vision Language Models", + "abstract": "Large-scale vision language (VL) models use Transformers to perform cross-modal interactions between the input text and image. These cross-modal interactions are computationally expensive and memory-intensive due to the quadratic complexity of processing the input image and text. We present PuMer: a token reduction framework that uses text-informed Pruning and modality-aware Merging strategies to progressively reduce the tokens of the input image and text, improving model inference speed and reducing memory footprint. PuMer learns to keep salient image tokens related to the input text and merges similar textual and visual tokens by adding lightweight token reducer modules at several cross-modal layers in the VL model. Training PuMer is mostly the same as finetuning the original VL model, but faster. Our evaluation of two vision language models on four downstream VL tasks shows PuMer increases inference throughput by up to 2x and reduces memory footprint by over 50% while incurring less than a 1% accuracy drop.", + "authors": [ + "Qingqing Cao", + "Bhargavi Paranjape", + "Hannaneh Hajishirzi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.721", + "point2d": [ + -54.45258712768555, + 34.60423278808594 + ], + "cluster": 26.0 + }, + { + "idx": 723, + "title": "Gloss-Free End-to-End Sign Language Translation", + "abstract": "In this paper, we tackle the problem of sign language translation (SLT) without gloss annotations. Although intermediate representations like glosses have been proven effective, gloss annotations are hard to acquire, especially in large quantities. This limits the domain coverage of translation datasets, thus handicapping real-world applications. To mitigate this problem, we design the Gloss-Free End-to-end sign language translation framework (GloFE). Our method improves the performance of SLT in the gloss-free setting by exploiting the shared underlying semantics of signs and the corresponding spoken translation. Common concepts are extracted from the text and used as a weak form of intermediate representation. The global embedding of these concepts is used as a query for cross-attention to find the corresponding information within the learned visual features. In a contrastive manner, we encourage the similarity of query results between samples containing such concepts and decrease those that do not. 
We obtained state-of-the-art results on large-scale datasets, including OpenASL and How2Sign.", + "authors": [ + "Kezhou Lin", + "Xiaohan Wang", + "Linchao Zhu", + "Ke Sun", + "Bang Zhang", + "Yi Yang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.722", + "point2d": [ + -80.6490249633789, + 21.798025131225586 + ], + "cluster": 37.0 + }, + { + "idx": 724, + "title": "TAGPRIME: A Unified Framework for Relational Structure Extraction", + "abstract": "Many tasks in natural language processing require the extraction of relationship information for a given condition, such as event argument extraction, relation extraction, and task-oriented semantic parsing. Recent works usually propose sophisticated models for each task independently and pay less attention to the commonality of these tasks and to building a unified framework for all of them. In this work, we propose to take a unified view of all these tasks and introduce TAGPRIME to address relational structure extraction problems. TAGPRIME is a sequence tagging model that appends priming words about the information of the given condition (such as an event trigger) to the input text. With the self-attention mechanism in pre-trained language models, the priming words make the output contextualized representations contain more information about the given condition, and hence become more suitable for extracting specific relationships for the condition. Extensive experiments and analyses on three different tasks that cover ten datasets across five different languages demonstrate the generality and effectiveness of TAGPRIME.", + "authors": [ + "I-Hung Hsu", + "Kuan-Hao Huang", + "Shuning Zhang", + "Wenxin Cheng", + "Prem Natarajan", + "Kai-Wei Chang", + "Nanyun Peng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.723", + "point2d": [ + 43.137786865234375, + -49.64680862426758 + ], + "cluster": 28.0 + }, + { + "idx": 725, + "title": "Model-Generated Pretraining Signals Improves Zero-Shot Generalization of Text-to-Text Transformers", + "abstract": "This paper explores the effectiveness of model-generated signals in improving zero-shot generalization of text-to-text Transformers such as T5. We study various designs to pretrain T5 using an auxiliary model to construct more challenging token replacements for the main model to denoise. Key aspects under study include the decoding target, the location of the RTD head, and the masking pattern. Based on these studies, we develop a new model, METRO-T0, which is pretrained using the redesigned ELECTRA-Style pretraining strategies and then prompt-finetuned on a mixture of NLP tasks. METRO-T0 outperforms all similar-sized baselines on prompted NLP benchmarks, such as _T0 Eval_ and MMLU, and rivals the state-of-the-art T0-11B model with only **8%** of its parameters. Our analysis of the model\u2019s neural activation and parameter sensitivity reveals that the effectiveness of METRO-T0 stems from a more balanced contribution of parameters and better utilization of their capacity. 
The code and model checkpoints are available at [https://github.com/gonglinyuan/metro_t0](https://github.com/gonglinyuan/metro_t0).", + "authors": [ + "Linyuan Gong", + "Chenyan Xiong", + "Xiaodong Liu", + "Payal Bajaj", + "Yiqing Xie", + "Alvin Cheung", + "Jianfeng Gao", + "Xia Song" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.724", + "point2d": [ + -35.582794189453125, + -24.234907150268555 + ], + "cluster": 30.0 + }, + { + "idx": 726, + "title": "BITE: Textual Backdoor Attacks with Iterative Trigger Injection", + "abstract": "Backdoor attacks have become an emerging threat to NLP systems. By providing poisoned training data, the adversary can embed a \u201cbackdoor\u201d into the victim model, which allows input instances satisfying certain textual patterns (e.g., containing a keyword) to be predicted as a target label of the adversary\u2019s choice. In this paper, we demonstrate that it is possible to design a backdoor attack that is both stealthy (i.e., hard to notice) and effective (i.e., has a high attack success rate). We propose BITE, a backdoor attack that poisons the training data to establish strong correlations between the target label and a set of \u201ctrigger words\u201d. These trigger words are iteratively identified and injected into the target-label instances through natural word-level perturbations. The poisoned training data instruct the victim model to predict the target label on inputs containing trigger words, forming the backdoor. Experiments on four text classification datasets show that our proposed attack is significantly more effective than baseline methods while maintaining decent stealthiness, raising alarms about the use of untrusted training data. We further propose a defense method named DeBITE based on potential trigger word removal, which outperforms existing methods in defending against BITE and generalizes well to handling other backdoor attacks.", + "authors": [ + "Jun Yan", + "Vansh Gupta", + "Xiang Ren" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.725", + "point2d": [ + 0.39192262291908264, + 9.669949531555176 + ], + "cluster": 15.0 + }, + { + "idx": 727, + "title": "A Crosslingual Investigation of Conceptualization in 1335 Languages", + "abstract": "Languages differ in how they divide up the world into concepts and words; e.g., in contrast to English, Swahili has a single concept for \u2018belly\u2019 and \u2018womb\u2019. We investigate these differences in conceptualization across 1,335 languages by aligning concepts in a parallel corpus. To this end, we propose Conceptualizer, a method that creates a bipartite directed alignment graph between source language concepts and sets of target language strings. In a detailed linguistic analysis across all languages for one concept (\u2018bird\u2019) and an evaluation on gold standard data for 32 Swadesh concepts, we show that Conceptualizer has good alignment accuracy. We demonstrate the potential of research on conceptualization in NLP with two experiments. (1) We define crosslingual stability of a concept as the degree to which it has 1-1 correspondences across languages, and show that concreteness predicts stability. (2) We represent each language by its conceptualization pattern for 83 concepts, and define a similarity measure on these representations. 
The resulting measure for the conceptual similarity between two languages is complementary to standard genealogical, typological, and surface similarity measures. For four out of six language families, we can assign languages to their correct family based on conceptual similarity, with accuracies between 54% and 87%.", + "authors": [ + "Yihong Liu", + "Haotian Ye", + "Leonie Weissweiler", + "Philipp Wicke", + "Renhao Pei", + "Robert Zangenfeind", + "Hinrich Sch\u00fctze" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.726", + "point2d": [ + 5.837584018707275, + -47.06345748901367 + ], + "cluster": 9.0 + }, + { + "idx": 728, + "title": "Exploring and Verbalizing Academic Ideas by Concept Co-occurrence", + "abstract": "Researchers usually come up with new ideas only after thoroughly comprehending vast quantities of literature. The difficulty of this procedure is exacerbated by the fact that the number of academic publications is growing exponentially. In this study, we devise a framework based on concept co-occurrence for academic idea inspiration, which has been integrated into a research assistant system. From our perspective, the emergence of a new idea can be regarded as the fusion of two concepts that co-occur in an academic paper. We construct evolving concept graphs according to the co-occurrence relationship of concepts from 20 disciplines or topics. Then we design a temporal link prediction method based on a masked language model to explore potential connections between different concepts. To verbalize the newly discovered connections, we also utilize the pretrained language model to generate a description of an idea based on a new data structure called the co-occurrence citation quintuple. We evaluate our proposed system using both automatic metrics and human assessment. The results demonstrate that our system has broad prospects and can assist researchers in expediting the process of discovering new ideas.", + "authors": [ + "Yi Xu", + "Shuqian Sheng", + "Bo Xue", + "Luoyi Fu", + "Xinbing Wang", + "Chenghu Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.727", + "point2d": [ + 50.0356559753418, + -59.114349365234375 + ], + "cluster": 40.0 + }, + { + "idx": 729, + "title": "mCLIP: Multilingual CLIP via Cross-lingual Transfer", + "abstract": "Large-scale vision-language pretrained (VLP) models like CLIP have shown remarkable performance on various downstream cross-modal tasks. However, they are usually biased towards English due to the lack of sufficient non-English image-text pairs. Existing multilingual VLP methods often learn retrieval-inefficient single-stream models with translation-augmented non-English image-text pairs. In this paper, we introduce mCLIP, a retrieval-efficient dual-stream multilingual VLP model, trained by aligning the CLIP model and a Multilingual Text Encoder (MTE) through a novel Triangle Cross-modal Knowledge Distillation (TriKD) method. It is parameter-efficient as only two light projectors on top of them are updated during distillation. Furthermore, to enhance the token- and sentence-level multilingual representation of the MTE, we propose to train it with machine translation and contrastive learning jointly before the TriKD to provide a better initialization. 
Empirical results show that mCLIP achieves new state-of-the-art performance for both zero-shot and finetuned multilingual image-text retrieval tasks.", + "authors": [ + "Guanhua Chen", + "Lu Hou", + "Yun Chen", + "Wenliang Dai", + "Lifeng Shang", + "Xin Jiang", + "Qun Liu", + "Jia Pan", + "Wenping Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.728", + "point2d": [ + -57.53701400756836, + 36.63343048095703 + ], + "cluster": 26.0 + }, + { + "idx": 730, + "title": "Distantly Supervised Course Concept Extraction in MOOCs with Academic Discipline", + "abstract": "With the rapid growth of Massive Open Online Courses (MOOCs), it is expensive and time-consuming to extract high-quality knowledgeable concepts taught in the course by human effort to help learners grasp the essence of the course. In this paper, we propose to automatically extract course concepts using distant supervision to eliminate the heavy work of human annotations, which generates labels by matching them with an easily accessed dictionary. However, this matching process suffers from severe noisy and incomplete annotations because of the limited dictionary and diverse MOOCs. To tackle these challenges, we present a novel three-stage framework DS-MOCE, which leverages the power of pre-trained language models explicitly and implicitly and employs discipline-embedding models with a self-train strategy based on label generation refinement across different domains. We also provide an expert-labeled dataset spanning 20 academic disciplines. Experimental results demonstrate the superiority of DS-MOCE over the state-of-the-art distantly supervised methods (with 7% absolute F1 score improvement). Code and data are now available at https://github.com/THU-KEG/MOOC-NER.", + "authors": [ + "Mengying Lu", + "Yuquan Wang", + "Jifan Yu", + "Yexing Du", + "Lei Hou", + "Juanzi Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.729", + "point2d": [ + -12.912042617797852, + -39.75105667114258 + ], + "cluster": 39.0 + }, + { + "idx": 731, + "title": "Extrinsic Evaluation of Machine Translation Metrics", + "abstract": "Automatic machine translation (MT) metrics are widely used to distinguish the quality of machine translation systems across relatively large test sets (system-level evaluation). However, it is unclear if automatic metrics are reliable at distinguishing good translations from bad translations at the sentence level (segment-level evaluation). In this paper, we investigate how useful MT metrics are at detecting the segment-level quality by correlating metrics with how useful the translations are for a downstream task. We evaluate the segment-level performance of the most widely used MT metrics (chrF, COMET, BERTScore, etc.) on three downstream cross-lingual tasks (dialogue state tracking, question answering, and semantic parsing). For each task, we only have access to a monolingual task-specific model and a translation model. We calculate the correlation between the metric\u2019s ability to predict a good/bad translation and the success/failure on the final task for the machine translated test sentences. Our experiments demonstrate that all metrics exhibit negligible correlation with the extrinsic evaluation of the downstream outcomes. We also find that the scores provided by neural metrics are not interpretable, in large part due to having undefined ranges.
We synthesise our analysis into recommendations for future MT metrics to produce labels rather than scores for more informative interaction between machine translation and multilingual language understanding.", + "authors": [ + "Nikita Moghe", + "Tom Sherborne", + "Mark Steedman", + "Alexandra Birch" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.730", + "point2d": [ + -74.87210845947266, + -2.9965786933898926 + ], + "cluster": 1.0 + }, + { + "idx": 732, + "title": "ExplainMeetSum: A Dataset for Explainable Meeting Summarization Aligned with Human Intent", + "abstract": "To enhance the explainability of meeting summarization, we construct a new dataset called \u201cExplainMeetSum,\u201d an augmented version of QMSum, by newly annotating evidence sentences that faithfully \u201cexplain\u201d a summary. Using ExplainMeetSum, we propose a novel multiple extractor guided summarization, namely Multi-DYLE, which extensively generalizes DYLE to enable using a supervised extractor based on human-aligned extractive oracles. We further present an explainability-aware task, named \u201cExplainable Evidence Extraction\u201d (E3), which aims to automatically detect all evidence sentences that support a given summary. Experimental results on the QMSum dataset show that the proposed Multi-DYLE outperforms DYLE with gains of up to 3.13 in the ROUGE-1 score. We further present the initial results on the E3 task, under the settings using separate and joint evaluation metrics.", + "authors": [ + "Hyun Kim", + "Minsoo Cho", + "Seung-Hoon Na" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.731", + "point2d": [ + -9.559903144836426, + 44.41742706298828 + ], + "cluster": 7.0 + }, + { + "idx": 733, + "title": "A Cross-Modality Context Fusion and Semantic Refinement Network for Emotion Recognition in Conversation", + "abstract": "Emotion recognition in conversation (ERC) has attracted enormous attention for its applications in empathetic dialogue systems. However, most previous studies simply concatenate multimodal representations, leading to an accumulation of redundant information and a limited context interaction between modalities. Furthermore, they only consider simple contextual features ignoring semantic clues, resulting in an insufficient capture of the semantic coherence and consistency in conversations. To address these limitations, we propose a cross-modality context fusion and semantic refinement network (CMCF-SRNet). Specifically, we first design a cross-modal locality-constrained transformer to explore the multimodal interaction. Second, we investigate a graph-based semantic refinement transformer, which solves the limitation of insufficient semantic relationship information between utterances. Extensive experiments on two public benchmark datasets show the effectiveness of our proposed method compared with other state-of-the-art methods, indicating its potential application in emotion recognition.
Our model will be available at https://github.com/zxiaohen/CMCF-SRNet.", + "authors": [ + "Xiaoheng Zhang", + "Yang Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.732", + "point2d": [ + -36.899654388427734, + 63.82816696166992 + ], + "cluster": 23.0 + }, + { + "idx": 734, + "title": "CAT: A Contextualized Conceptualization and Instantiation Framework for Commonsense Reasoning", + "abstract": "Commonsense reasoning, aiming at endowing machines with a human-like ability to make situational presumptions, is extremely challenging to generalize. For someone who barely knows about \u201cmeditation\u201d while being knowledgeable about \u201csinging,\u201d he can still infer that \u201cmeditation makes people relaxed\u201d from the existing knowledge that \u201csinging makes people relaxed\u201d by first conceptualizing \u201csinging\u201d as a \u201crelaxing event\u201d and then instantiating that event to \u201cmeditation.\u201d This process, known as conceptual induction and deduction, is fundamental to commonsense reasoning while lacking both labeled data and methodologies to enhance commonsense modeling. To fill such a research gap, we propose CAT (Contextualized ConceptuAlization and InsTantiation), a semi-supervised learning framework that integrates event conceptualization and instantiation to conceptualize commonsense knowledge bases at scale. Extensive experiments show that our framework achieves state-of-the-art performances on two conceptualization tasks, and the acquired abstract commonsense knowledge can significantly improve commonsense inference modeling. Our code, data, and fine-tuned models are publicly available at [https://github.com/HKUST-KnowComp/CAT](https://github.com/HKUST-KnowComp/CAT).", + "authors": [ + "Weiqi Wang", + "Tianqing Fang", + "Baixuan Xu", + "Chun Yi Louis Bo", + "Yangqiu Song", + "Lei Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.733", + "point2d": [ + 57.0453987121582, + -19.448026657104492 + ], + "cluster": 31.0 + }, + { + "idx": 735, + "title": "The Elephant in the Room: Analyzing the Presence of Big Tech in Natural Language Processing Research", + "abstract": "Recent advances in deep learning methods for natural language processing (NLP) have created new business opportunities and made NLP research critical for industry development. Since industry is one of the big players in the field of NLP, together with governments and universities, it is important to track the influence of industry on research. In this study, we seek to quantify and characterize industry presence in the NLP community over time. Using a corpus with comprehensive metadata of 78,187 NLP publications and 701 resumes of NLP publication authors, we explore the industry presence in the field since the early 90s. We find that industry presence among NLP authors has been steady before a steep increase over the past five years (180% growth from 2017 to 2022). A few companies account for most of the publications and provide funding to academic researchers through grants and internships. Our study shows that the presence and impact of the industry on natural language processing research are significant and fast-growing.
This work calls for increased transparency of industry influence in the field.", + "authors": [ + "Mohamed Abdalla", + "Jan Philip Wahle", + "Terry Lima Ruas", + "Aur\u00e9lie N\u00e9v\u00e9ol", + "Fanny Ducel", + "Saif Mohammad", + "Karen Fort" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.734", + "point2d": [ + 20.21767807006836, + 14.131454467773438 + ], + "cluster": 40.0 + }, + { + "idx": 736, + "title": "Language of Bargaining", + "abstract": "Leveraging an established exercise in negotiation education, we build a novel dataset for studying how the use of language shapes bilateral bargaining. Our dataset extends existing work in two ways: 1) we recruit participants via behavioral labs instead of crowdsourcing platforms and allow participants to negotiate through audio, enabling more naturalistic interactions; 2) we add a control setting where participants negotiate only through alternating, written numeric offers. Despite the two contrasting forms of communication, we find that the average agreed prices of the two treatments are identical. But when subjects can talk, fewer offers are exchanged, negotiations finish faster, the likelihood of reaching agreement rises, and the variance of prices at which subjects agree drops substantially. We further propose a taxonomy of speech acts in negotiation and enrich the dataset with annotated speech acts. We set up prediction tasks to predict negotiation success and find that being reactive to the arguments of the other party is advantageous over driving the negotiation.", + "authors": [ + "Mourad Heddaya", + "Solomon Dworkin", + "Chenhao Tan", + "Rob Voigt", + "Alexander Zentefis" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.735", + "point2d": [ + 26.88296890258789, + 40.77985763549805 + ], + "cluster": 10.0 + }, + { + "idx": 737, + "title": "Do Question Answering Modeling Improvements Hold Across Benchmarks?", + "abstract": "Do question answering (QA) modeling improvements (e.g., choice of architecture and training procedure) hold consistently across the diverse landscape of QA benchmarks? To study this question, we introduce the notion of concurrence\u2014two benchmarks have high concurrence on a set of modeling approaches if they rank the modeling approaches similarly. We measure the concurrence between 32 QA benchmarks on a set of 20 diverse modeling approaches and find that human-constructed benchmarks have high concurrence amongst themselves, even if their passage and question distributions are very different. Surprisingly, even downsampled human-constructed benchmarks (i.e., collecting less data) and programmatically-generated benchmarks (e.g., cloze-formatted examples) have high concurrence with human-constructed benchmarks. These results indicate that, despite years of intense community focus on a small number of benchmarks, the modeling improvements studied hold broadly.", + "authors": [ + "Nelson F. Liu", + "Tony Lee", + "Robin Jia", + "Percy Liang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.736", + "point2d": [ + 63.82573318481445, + 12.5968656539917 + ], + "cluster": 5.0 + }, + { + "idx": 738, + "title": "VLN-Trans: Translator for the Vision and Language Navigation Agent", + "abstract": "Language understanding is essential for the navigation agent to follow instructions. 
We observe two kinds of issues in the instructions that can make the navigation task challenging: 1. The mentioned landmarks are not recognizable by the navigation agent due to the different vision abilities of the instructor and the modeled agent. 2. The mentioned landmarks are applicable to multiple targets, thus not distinctive for selecting the target among the candidate viewpoints. To deal with these issues, we design a translator module for the navigation agent to convert the original instructions into easy-to-follow sub-instruction representations at each step. The translator needs to focus on the recognizable and distinctive landmarks based on the agent\u2019s visual abilities and the observed visual environment. To achieve this goal, we create a new synthetic sub-instruction dataset and design specific tasks to train the translator and the navigation agent. We evaluate our approach on the Room2Room (R2R), Room4Room (R4R), and Room2Room Last (R2R-Last) datasets and achieve state-of-the-art results on multiple benchmarks.", + "authors": [ + "Yue Zhang", + "Parisa Kordjamshidi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.737", + "point2d": [ + -21.658205032348633, + -15.623625755310059 + ], + "cluster": 43.0 + }, + { + "idx": 739, + "title": "Bridging the Gap between Decision and Logits in Decision-based Knowledge Distillation for Pre-trained Language Models", + "abstract": "Conventional knowledge distillation (KD) methods require access to the internal information of teachers, e.g., logits. However, such information may not always be accessible for large pre-trained language models (PLMs). In this work, we focus on decision-based KD for PLMs, where only teacher decisions (i.e., top-1 labels) are accessible. Considering the information gap between logits and decisions, we propose a novel method to estimate logits from the decision distributions. Specifically, decision distributions can be both derived as a function of logits theoretically and estimated with test-time data augmentation empirically. By combining the theoretical and empirical estimations of the decision distributions together, the estimation of logits can be successfully reduced to a simple root-finding problem. Extensive experiments show that our method significantly outperforms strong baselines on both natural language understanding and machine reading comprehension datasets.", + "authors": [ + "Qinhong Zhou", + "Zonghan Yang", + "Peng Li", + "Yang Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.738", + "point2d": [ + -50.95370101928711, + -24.98257064819336 + ], + "cluster": 39.0 + }, + { + "idx": 740, + "title": "Continual Contrastive Finetuning Improves Low-Resource Relation Extraction", + "abstract": "Relation extraction (RE), which has relied on structurally annotated corpora for model training, has been particularly challenging in low-resource scenarios and domains. Recent literature has tackled low-resource RE by self-supervised learning, where the solution involves pretraining the entity pair embedding by an RE-based objective and finetuning on labeled data by a classification-based objective. However, a critical challenge to this approach is the gap in objectives, which prevents the RE model from fully utilizing the knowledge in pretrained representations. In this paper, we aim at bridging the gap and propose to pretrain and finetune the RE model using consistent objectives of contrastive learning.
Since, in this kind of representation learning paradigm, one relation may easily form multiple clusters in the representation space, we further propose a multi-center contrastive loss that allows one relation to form multiple clusters to better align with pretraining. Experiments on two document-level RE datasets, BioRED and Re-DocRED, demonstrate the effectiveness of our method. Particularly, when using 1% end-task training data, our method outperforms the PLM-based RE classifier by 10.5% and 6.1% on the two datasets, respectively.", + "authors": [ + "Wenxuan Zhou", + "Sheng Zhang", + "Tristan Naumann", + "Muhao Chen", + "Hoifung Poon" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.739", + "point2d": [ + 40.81532669067383, + -64.34825134277344 + ], + "cluster": 25.0 + }, + { + "idx": 741, + "title": "KGA: A General Machine Unlearning Framework Based on Knowledge Gap Alignment", + "abstract": "Recent legislation of the \u201cright to be forgotten\u201d has led to the interest in machine unlearning, where the learned models are endowed with the function to forget information about specific training instances as if they have never existed in the training set. Previous work mainly focuses on computer vision scenarios and largely ignores the essentials of unlearning in the NLP field, where text data contains more explicit and sensitive personal information than images. In this paper, we propose a general unlearning framework called KGA to induce forgetfulness. Different from previous work that tries to recover gradients or forces models to perform close to one specific distribution, KGA maintains distribution differences (i.e., knowledge gap). This relaxes the distribution assumption. Furthermore, we first apply the unlearning method to various NLP tasks (i.e., classification, translation, response generation) and propose several unlearning evaluation metrics with pertinence. Experiments on large-scale datasets show that KGA yields comprehensive improvements over baselines, where extensive analyses further validate the effectiveness of KGA and provide insight into unlearning for NLP tasks.", + "authors": [ + "Lingzhi Wang", + "Tong Chen", + "Wei Yuan", + "Xingshan Zeng", + "Kam-Fai Wong", + "Hongzhi Yin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.740", + "point2d": [ + -28.5753173828125, + -20.73773193359375 + ], + "cluster": 39.0 + }, + { + "idx": 742, + "title": "UniCoRN: Unified Cognitive Signal ReconstructioN bridging cognitive signals and human language", + "abstract": "Decoding text stimuli from cognitive signals (e.g. fMRI) enhances our understanding of the human language system, paving the way for building a versatile Brain-Computer Interface. However, existing studies largely focus on decoding individual word-level fMRI volumes from a restricted vocabulary, which is far too idealized for real-world application. In this paper, we propose fMRI2text, the first open-vocabulary task aiming to bridge fMRI time series and human language. Furthermore, to explore the potential of this new task, we present a baseline solution, UniCoRN: the Unified Cognitive Signal ReconstructioN for Brain Decoding. By reconstructing both individual time points and time series, UniCoRN establishes a robust encoder for cognitive signals (fMRI & EEG). Leveraging a pre-trained language model as decoder, UniCoRN proves its efficacy in decoding coherent text from fMRI series across various split settings.
Our model achieves a 34.77% BLEU score on fMRI2text, and a 37.04% BLEU when generalized to EEG-to-text decoding, thereby surpassing the former baseline. Experimental results indicate the feasibility of decoding consecutive fMRI volumes, and the effectiveness of decoding different cognitive signals using a unified structure.", + "authors": [ + "Nuwa Xi", + "Sendong Zhao", + "Haochun Wang", + "Chi Liu", + "Bing Qin", + "Ting Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.741", + "point2d": [ + -59.9044075012207, + 25.326990127563477 + ], + "cluster": 16.0 + }, + { + "idx": 743, + "title": "Dense-ATOMIC: Towards Densely-connected ATOMIC with High Knowledge Coverage and Massive Multi-hop Paths", + "abstract": "ATOMIC is a large-scale commonsense knowledge graph (CSKG) containing everyday if-then knowledge triplets, i.e., head event, relation, tail event. The one-hop annotation manner made ATOMIC a set of independent bipartite graphs, which ignored the numerous links between events in different bipartite graphs and consequently caused shortages in knowledge coverage and multi-hop paths. In this work, we aim to construct Dense-ATOMIC with high knowledge coverage and massive multi-hop paths. The events in ATOMIC are normalized to a consistent pattern at first. We then propose a CSKG completion method called Rel-CSKGC to predict the relation given the head event and the tail event of a triplet, and train a CSKG completion model based on existing triplets in ATOMIC. We finally utilize the model to complete the missing links in ATOMIC and accordingly construct Dense-ATOMIC. Both automatic and human evaluation on an annotated subgraph of ATOMIC demonstrate the advantage of Rel-CSKGC over strong baselines. We further conduct extensive evaluations on Dense-ATOMIC in terms of statistics, human evaluation, and simple downstream tasks, all proving Dense-ATOMIC\u2019s advantages in Knowledge Coverage and Multi-hop Paths. Both the source code of Rel-CSKGC and Dense-ATOMIC are publicly available on https://github.com/NUSTM/Dense-ATOMIC.", + "authors": [ + "Xiangqing Shen", + "Siwei Wu", + "Rui Xia" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.742", + "point2d": [ + 56.579261779785156, + -63.355690002441406 + ], + "cluster": 45.0 + }, + { + "idx": 744, + "title": "Shrinking Embeddings for Hyper-Relational Knowledge Graphs", + "abstract": "Link prediction on knowledge graphs (KGs) has been extensively studied on binary relational KGs, wherein each fact is represented by a triple. A significant amount of important knowledge, however, is represented by hyper-relational facts where each fact is composed of a primal triple and a set of qualifiers comprising a key-value pair that allows for expressing more complicated semantics. Although some recent works have proposed to embed hyper-relational KGs, these methods fail to capture essential inference patterns of hyper-relational facts such as qualifier monotonicity, qualifier implication, and qualifier mutual exclusion, limiting their generalization capability. To unlock this, we present ShrinkE, a geometric hyper-relational KG embedding method aiming to explicitly model these patterns. ShrinkE models the primal triple as a spatial-functional transformation from the head into a relation-specific box. Each qualifier \u201cshrinks\u201d the box to narrow down the possible answer set and, thus, realizes qualifier monotonicity. 
The spatial relationships between the qualifier boxes allow for modeling core inference patterns of qualifiers such as implication and mutual exclusion. Experimental results demonstrate ShrinkE\u2019s superiority on three benchmarks of hyper-relational KGs.", + "authors": [ + "Bo Xiong", + "Mojtaba Nayyeri", + "Shirui Pan", + "Steffen Staab" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.743", + "point2d": [ + 54.69093322753906, + -62.35207748413086 + ], + "cluster": 45.0 + }, + { + "idx": 745, + "title": "CTC-based Non-autoregressive Speech Translation", + "abstract": "Combining end-to-end speech translation (ST) and non-autoregressive (NAR) generation is promising in language and speech processing for their advantages of less error propagation and low latency. In this paper, we investigate the potential of connectionist temporal classification (CTC) for non-autoregressive speech translation (NAST). In particular, we develop a model consisting of two encoders that are guided by CTC to predict the source and target texts, respectively. Introducing CTC into NAST on both language sides has obvious challenges: 1) the conditional independent generation somewhat breaks the interdependency among tokens, and 2) the monotonic alignment assumption in standard CTC does not hold in translation tasks. In response, we develop a prediction-aware encoding approach and a cross-layer attention approach to address these issues. We also use curriculum learning to improve convergence of training. Experiments on the MuST-C ST benchmarks show that our NAST model achieves an average BLEU score of 29.5 with a speed-up of 5.67\\times, which is comparable to the autoregressive counterpart and even outperforms the previous best result by 0.9 BLEU points.", + "authors": [ + "Chen Xu", + "Xiaoqian Liu", + "Xiaowen Liu", + "Qingxuan Sun", + "Yuhao Zhang", + "Murun Yang", + "Qianqian Dong", + "Tom Ko", + "Mingxuan Wang", + "Tong Xiao", + "Anxiang Ma", + "Jingbo Zhu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.744", + "point2d": [ + -66.179443359375, + 17.49823570251465 + ], + "cluster": 37.0 + }, + { + "idx": 746, + "title": "Attention as a Guide for Simultaneous Speech Translation", + "abstract": "In simultaneous speech translation (SimulST), effective policies that determine when to write partial translations are crucial to reach high output quality with low latency. Towards this objective, we propose EDAtt (Encoder-Decoder Attention), an adaptive policy that exploits the attention patterns between audio source and target textual translation to guide an offline-trained ST model during simultaneous inference. EDAtt exploits the attention scores modeling the audio-translation relation to decide whether to emit a partial hypothesis or wait for more audio input. This is done under the assumption that, if attention is focused towards the most recently received speech segments, the information they provide can be insufficient to generate the hypothesis (indicating that the system has to wait for additional audio input).
Results on en->de and en->es show that EDAtt yields better results than the SimulST state of the art, with gains of up to 7 and 4 BLEU points for the two languages, respectively, and with a reduction in computational-aware latency of up to 1.4s and 0.7s compared to existing SimulST policies applied to offline-trained models.", + "authors": [ + "Sara Papi", + "Matteo Negri", + "Marco Turchi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.745", + "point2d": [ + -64.75476837158203, + 18.945993423461914 + ], + "cluster": 37.0 + }, + { + "idx": 747, + "title": "On Complementarity Objectives for Hybrid Retrieval", + "abstract": "Dense retrieval has shown promising results in various information retrieval tasks, and hybrid retrieval, combined with the strength of sparse retrieval, has also been actively studied. A key challenge in hybrid retrieval is to make sparse and dense complementary to each other. Existing models have focused on dense models to capture \u201cresidual\u201d features neglected in the sparse models. Our key distinction is to show how this notion of residual complementarity is limited, and propose a new objective, denoted as RoC (Ratio of Complementarity), which captures a fuller notion of complementarity. We propose a two-level orthogonality designed to improve RoC, then show that the improved RoC of our model, in turn, improves the performance of hybrid retrieval. Our method outperforms all state-of-the-art methods on three representative IR benchmarks: MSMARCO-Passage, Natural Questions, and TREC Robust04, with statistical significance. Our finding is also consistent in various adversarial settings.", + "authors": [ + "Dohyeon Lee", + "Seung-won Hwang", + "Kyungjae Lee", + "Seungtaek Choi", + "Sunghyun Park" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.746", + "point2d": [ + 15.296813011169434, + -11.317105293273926 + ], + "cluster": 18.0 + }, + { + "idx": 748, + "title": "C-STANCE: A Large Dataset for Chinese Zero-Shot Stance Detection", + "abstract": "Zero-shot stance detection (ZSSD) aims to determine whether the author of a text is in favor of, against, or neutral toward a target that is unseen during training. Despite the growing attention on ZSSD, most recent advances in this task are limited to English and do not pay much attention to other languages such as Chinese. To support ZSSD research, in this paper, we present C-STANCE that, to our knowledge, is the first Chinese dataset for zero-shot stance detection. We introduce two challenging subtasks for ZSSD: target-based ZSSD and domain-based ZSSD. Our dataset includes both noun-phrase targets and claim targets, covering a wide range of domains. We provide a detailed description and analysis of our dataset. To establish results on C-STANCE, we report performance scores using state-of-the-art deep learning models.
We publicly release our dataset and code to facilitate future research.", + "authors": [ + "Chenye Zhao", + "Yingjie Li", + "Cornelia Caragea" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.747", + "point2d": [ + 49.882423400878906, + 37.956119537353516 + ], + "cluster": 34.0 + }, + { + "idx": 749, + "title": "Wukong-Reader: Multi-modal Pre-training for Fine-grained Visual Document Understanding", + "abstract": "Unsupervised pre-training on millions of digital-born or scanned documents has shown promising advances in visual document understanding (VDU). While various vision-language pre-training objectives are studied in existing solutions, the document textline, as an intrinsic granularity in VDU, has seldom been explored so far. A document textline usually contains words that are spatially and semantically correlated, which can be easily obtained from OCR engines. In this paper, we propose Wukong-Reader, trained with new pre-training objectives to leverage the structural knowledge nested in document textlines. We introduce textline-region contrastive learning to achieve fine-grained alignment between the visual regions and texts of document textlines. Furthermore, masked region modeling and textline-grid matching are also designed to enhance the visual and layout representations of textlines. Experiments show that Wukong-Reader brings superior performance on various VDU tasks in both English and Chinese. The fine-grained alignment over textlines also empowers Wukong-Reader with promising localization ability.", + "authors": [ + "Haoli Bai", + "Zhiguang Liu", + "Xiaojun Meng", + "Li Wentao", + "Shuang Liu", + "Yifeng Luo", + "Nian Xie", + "Rongfu Zheng", + "Liangwei Wang", + "Lu Hou", + "Jiansheng Wei", + "Xin Jiang", + "Qun Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.748", + "point2d": [ + -49.75861358642578, + 45.883888244628906 + ], + "cluster": 43.0 + }, + { + "idx": 750, + "title": "PaCE: Unified Multi-modal Dialogue Pre-training with Progressive and Compositional Experts", + "abstract": "Perceiving multi-modal information and fulfilling dialogues with humans is a long-term goal of artificial intelligence. Pre-training is commonly regarded as an effective approach for multi-modal dialogue. However, due to the limited availability of multi-modal dialogue data, there is still scarce research on multi-modal dialogue pre-training. Yet another intriguing challenge emerges from the encompassing nature of multi-modal dialogue, which involves various modalities and tasks. Moreover, new forms of tasks may arise at unpredictable points in the future. Hence, it is essential for designed multi-modal dialogue models to possess sufficient flexibility to adapt to such scenarios. This paper proposes PaCE, a unified, structured, compositional multi-modal dialogue pre-training framework. It utilizes a combination of several fundamental experts to accommodate multiple dialogue-related tasks and can be pre-trained using limited dialogue and extensive non-dialogue multi-modal data. Furthermore, we propose a progressive training method where old experts from the past can assist new experts, facilitating the expansion of their capabilities. 
Experimental results demonstrate that PaCE achieves state-of-the-art results on eight multi-modal dialog benchmarks.", + "authors": [ + "Yunshui Li", + "Binyuan Hui", + "ZhiChao Yin", + "Min Yang", + "Fei Huang", + "Yongbin Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.749", + "point2d": [ + 8.887083053588867, + 72.48029327392578 + ], + "cluster": 49.0 + }, + { + "idx": 751, + "title": "MVP-Tuning: Multi-View Knowledge Retrieval with Prompt Tuning for Commonsense Reasoning", + "abstract": "Recent advances in pre-trained language models (PLMs) have facilitated the development of commonsense reasoning tasks. However, existing methods rely on multi-hop knowledge retrieval and thus suffer low accuracy due to embedded noise in the acquired knowledge. In addition, these methods often attain high computational costs and nontrivial knowledge loss because they encode the knowledge independently of the PLM, making it less relevant to the task and thus resulting in a poor local optimum. In this work, we propose Multi-View Knowledge Retrieval with Prompt Tuning (MVP-Tuning). MVP-Tuning leverages similar question-answer pairs in the training set to improve knowledge retrieval and employs a single prompt-tuned PLM to model knowledge and input text jointly. We conduct our experiments on five commonsense reasoning QA benchmarks to show that MVP-Tuning outperforms all other baselines in 4 out of 5 datasets with less than 2% trainable parameters. MVP-Tuning even gets a new state-of-the-art result on OpenBookQA and is number one on the leaderboard.", + "authors": [ + "Yongfeng Huang", + "Yanyang Li", + "Yichong Xu", + "Lin Zhang", + "Ruyi Gan", + "Jiaxing Zhang", + "Liwei Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.750", + "point2d": [ + 67.521484375, + 1.9493733644485474 + ], + "cluster": 5.0 + }, + { + "idx": 752, + "title": "PEIT: Bridging the Modality Gap with Pre-trained Models for End-to-End Image Translation", + "abstract": "Image translation is a task that translates an image containing text in the source language to the target language. One major challenge with image translation is the modality gap between visual text inputs and textual inputs/outputs of machine translation (MT). In this paper, we propose PEIT, an end-to-end image translation framework that bridges the modality gap with pre-trained models. It is composed of four essential components: a visual encoder, a shared encoder-decoder backbone network, a vision-text representation aligner equipped with the shared encoder and a cross-modal regularizer stacked over the shared decoder. Both the aligner and regularizer aim at reducing the modality gap. To train PEIT, we employ a two-stage pre-training strategy with an auxiliary MT task: (1) pre-training the MT model on the MT training data to initialize the shared encoder-decoder backbone network; and (2) pre-training PEIT with the aligner and regularizer on a synthesized dataset with rendered images containing text from the MT training data. In order to facilitate the evaluation of PEIT and promote research on image translation, we create a large-scale image translation corpus ECOIT containing 480K image-translation pairs via crowd-sourcing and manual post-editing from real-world images in the e-commerce domain.
Experiments on the curated ECOIT benchmark dataset demonstrate that PEIT substantially outperforms both cascaded image translation systems (OCR+MT) and the previous strong end-to-end image translation model, with fewer parameters and faster decoding speed.", + "authors": [ + "Shaolin Zhu", + "Shangjie Li", + "Yikun Lei", + "Deyi Xiong" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.751", + "point2d": [ + -67.67948150634766, + 40.478546142578125 + ], + "cluster": 26.0 + }, + { + "idx": 753, + "title": "Topic-Guided Sampling For Data-Efficient Multi-Domain Stance Detection", + "abstract": "The task of Stance Detection is concerned with identifying the attitudes expressed by an author towards a target of interest. This task spans a variety of domains ranging from social media opinion identification to detecting the stance for a legal claim. However, the framing of the task varies within these domains in terms of the data collection protocol, the label dictionary and the number of available annotations. Furthermore, these stance annotations are significantly imbalanced on a per-topic and inter-topic basis. These factors make multi-domain stance detection challenging, requiring standardization and domain adaptation. To overcome this challenge, we propose Topic Efficient StancE Detection (TESTED), consisting of a topic-guided diversity sampling technique used for creating a multi-domain data efficient training set and a contrastive objective that is used for fine-tuning a stance classifier using the produced set. We evaluate the method on an existing benchmark of 16 datasets with in-domain (i.e., all topics seen) and out-of-domain (i.e., unseen topics) experiments. The results show that the method outperforms the state-of-the-art with an average increase of 3.5 F1 points in-domain and is more generalizable with an average 10.2 F1 on out-of-domain evaluation while using <10% of the training data. We show that our sampling technique mitigates both inter- and per-topic class imbalances. Finally, our analysis demonstrates that the contrastive learning objective allows the model to produce a more pronounced segmentation of samples with varying labels.", + "authors": [ + "Erik Arakelyan", + "Arnav Arora", + "Isabelle Augenstein" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.752", + "point2d": [ + 48.51138687133789, + 39.1632194519043 + ], + "cluster": 19.0 + }, + { + "idx": 754, + "title": "DiSCoMaT: Distantly Supervised Composition Extraction from Tables in Materials Science Articles", + "abstract": "A crucial component in the curation of a KB for a scientific domain (e.g., materials science, food & nutrition, fuels) is information extraction from tables in the domain\u2019s published research articles. To facilitate research in this direction, we define a novel NLP task of extracting compositions of materials (e.g., glasses) from tables in materials science papers. The task involves solving several challenges in concert, such as tables that mention compositions have highly varying structures; text in captions and full paper needs to be incorporated along with data in tables; and regular languages for numbers, chemical compounds, and composition expressions must be integrated into the model. We release a training dataset comprising 4,408 distantly supervised tables, along with 1,475 manually annotated dev and test tables.
We also present DiSCoMaT, a strong baseline that combines multiple graph neural networks with several task-specific regular expressions, features, and constraints. We show that DiSCoMaT outperforms recent table processing architectures by significant margins. We release our code and data for further research on this challenging IE task from scientific tables.", + "authors": [ + "Tanishq Gupta", + "Mohd Zaki", + "Devanshi Khatsuriya", + "Kausik Hira", + "N M Anoop Krishnan", + "Mausam -" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.753", + "point2d": [ + 29.20009994506836, + -26.657211303710938 + ], + "cluster": 40.0 + }, + { + "idx": 755, + "title": "Self-Instruct: Aligning Language Models with Self-Generated Instructions", + "abstract": "Large \u201cinstruction-tuned\u201d language models (i.e., finetuned to respond to instructions) have demonstrated a remarkable ability to generalize zero-shot to new tasks. Nevertheless, they depend heavily on human-written instruction data that is often limited in quantity, diversity, and creativity, therefore hindering the generality of the tuned model. We introduce Self-Instruct, a framework for improving the instruction-following capabilities of pretrained language models by bootstrapping off their own generations. Our pipeline generates instructions, input, and output samples from a language model, then filters invalid or similar ones before using them to finetune the original model. Applying our method to the vanilla GPT3, we demonstrate a 33% absolute improvement over the original model on Super-NaturalInstructions, on par with the performance of InstructGPT-001, which was trained with private user data and human annotations. For further evaluation, we curate a set of expert-written instructions for novel tasks, and show through human evaluation that tuning GPT3 with Self-Instruct outperforms using existing public instruction datasets by a large margin, leaving only a 5% absolute gap behind InstructGPT-001. Self-Instruct provides an almost annotation-free method for aligning pre-trained language models with instructions, and we release our large synthetic dataset to facilitate future studies on instruction tuning.", + "authors": [ + "Yizhong Wang", + "Yeganeh Kordi", + "Swaroop Mishra", + "Alisa Liu", + "Noah A. Smith", + "Daniel Khashabi", + "Hannaneh Hajishirzi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.754", + "point2d": [ + -19.779441833496094, + -17.873971939086914 + ], + "cluster": 3.0 + }, + { + "idx": 756, + "title": "Disentangled Phonetic Representation for Chinese Spelling Correction", + "abstract": "Chinese Spelling Correction (CSC) aims to detect and correct erroneous characters in Chinese texts. Although efforts have been made to introduce phonetic information (Hanyu Pinyin) in this task, they typically merge phonetic representations with character representations, which tends to weaken the representation effect of normal texts. In this work, we propose to disentangle the two types of features to allow for direct interaction between textual and phonetic information. To learn useful phonetic representations, we introduce a pinyin-to-character objective to ask the model to predict the correct characters based solely on phonetic information, where a separation mask is imposed to disable attention from phonetic input to text. 
To avoid overfitting the phonetics, we further design a self-distillation module to ensure that semantic information plays a major role in the prediction. Extensive experiments on three CSC benchmarks demonstrate the superiority of our method in using phonetic information.", + "authors": [ + "Zihong Liang", + "Xiaojun Quan", + "Qifan Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.755", + "point2d": [ + -38.984840393066406, + 8.370443344116211 + ], + "cluster": 30.0 + }, + { + "idx": 757, + "title": "Dissecting Transformer Length Extrapolation via the Lens of Receptive Field Analysis", + "abstract": "Length extrapolation permits training a transformer language model on short sequences that preserves perplexities when tested on substantially longer sequences. A relative positional embedding design, ALiBi, has had the widest usage to date. We dissect ALiBi via the lens of receptive field analysis empowered by a novel cumulative normalized gradient tool. The concept of receptive field further allows us to modify the vanilla Sinusoidal positional embedding to create Sandwich, the first parameter-free relative positional embedding design that truly uses length information longer than the training sequence. Sandwich shares with KERPLE and T5 the same logarithmic decaying temporal bias pattern with learnable relative positional embeddings; these elucidate future extrapolatable positional embedding design.", + "authors": [ + "Ta-Chung Chi", + "Ting-Han Fan", + "Alexander Rudnicky", + "Peter Ramadge" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.756", + "point2d": [ + -39.8692626953125, + -28.936803817749023 + ], + "cluster": 27.0 + }, + { + "idx": 758, + "title": "CHBias: Bias Evaluation and Mitigation of Chinese Conversational Language Models", + "abstract": "Warning: This paper contains content that may be offensive or upsetting. Pretrained conversational agents have been exposed to safety issues, exhibiting a range of stereotypical human biases such as gender bias. However, there are still limited bias categories in current research, and most of them only focus on English. In this paper, we introduce a new Chinese dataset, CHBias, for bias evaluation and mitigation of Chinese conversational language models. Apart from those previous well-explored bias categories, CHBias includes under-explored bias categories, such as ageism and appearance biases, which received less attention. We evaluate two popular pretrained Chinese conversational models, CDial-GPT and EVA2.0, using CHBias. Furthermore, to mitigate different biases, we apply several debiasing methods to the Chinese pretrained models.
Experimental results show that these Chinese pretrained models are potentially risky for generating texts that contain social biases, and debiasing methods using the proposed dataset can make response generation less biased while preserving the models\u2019 conversational capabilities.", + "authors": [ + "Jiaxu Zhao", + "Meng Fang", + "Zijing Shi", + "Yitong Li", + "Ling Chen", + "Mykola Pechenizkiy" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.757", + "point2d": [ + 17.441862106323242, + 33.10116195678711 + ], + "cluster": 10.0 + }, + { + "idx": 759, + "title": "Learning New Skills after Deployment: Improving open-domain internet-driven dialogue with human feedback", + "abstract": "Frozen models trained to mimic static datasets can never improve their performance. Models that can employ internet-retrieval for up-to-date information and obtain feedback from humans during deployment provide the promise of both adapting to new information, and improving their performance. In this work we study how to improve internet-driven conversational skills in such a learning framework. We collect deployment data, which we make publicly available, of human interactions, and collect various types of human feedback \u2013 including binary quality measurements, free-form text feedback, and fine-grained reasons for failure. We then study various algorithms for improving from such feedback, including standard supervised learning, rejection sampling, model-guiding and reward-based learning, in order to make recommendations on which type of feedback and algorithms work best. We find the recently introduced DIRECTOR model (Arora et al., 2022) shows significant improvements over other existing approaches.", + "authors": [ + "Jing Xu", + "Megan Ung", + "Mojtaba Komeili", + "Kushal Arora", + "Y-Lan Boureau", + "Jason Weston" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.758", + "point2d": [ + 16.121742248535156, + 60.83510208129883 + ], + "cluster": 2.0 + }, + { + "idx": 760, + "title": "Uncovering and Categorizing Social Biases in Text-to-SQL", + "abstract": "Large pre-trained language models are acknowledged to carry social bias towards different demographics, which can further amplify existing stereotypes in our society and cause even more harm. Text-to-SQL is an important task, models of which are mainly adopted by administrative industries, where unfair decisions may lead to catastrophic consequences. However, existing Text-to-SQL models are trained on clean, neutral datasets, such as Spider and WikiSQL. This, to some extent, covers up social bias in models under ideal conditions, which nevertheless may emerge in real application scenarios. In this work, we aim to uncover and mitigate social bias in Text-to-SQL models. We summarize the categories of social bias that may occur in structural data for Text-to-SQL models. We build test benchmarks and reveal that models with similar task accuracy can contain social bias at very different rates.
We show how to take advantage of our methodology to assess and mitigate social bias in the downstream Text-to-SQL task.", + "authors": [ + "Yan Liu", + "Yan Gao", + "Zhe Su", + "Xiaokang Chen", + "Elliott Ash", + "Jian-Guang Lou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.759", + "point2d": [ + 18.83458709716797, + 28.896007537841797 + ], + "cluster": 10.0 + }, + { + "idx": 761, + "title": "On the Compositional Generalization in Versatile Open-domain Dialogue", + "abstract": "Previous research has demonstrated the potential of multi-task learning to foster a conversational agent\u2019s ability to acquire a variety of skills. However, these approaches either suffer from interference among different datasets (also known as negative transfer), or fail to effectively reuse knowledge and skills learned from other datasets. In contrast to previous works, we develop a sparsely activated modular network: (1) We propose a well-rounded set of operators and instantiate each operator with an independent module; (2) We formulate dialogue generation as the execution of a generated programme which recursively composes and assembles modules. Extensive experiments on 9 datasets verify the efficacy of our methods through automatic evaluation and human evaluation. Notably, our model outperforms state-of-the-art supervised approaches on 4 datasets with only 10% training data thanks to the modular architecture and multi-task learning.", + "authors": [ + "Tingchen Fu", + "Xueliang Zhao", + "Lemao Liu", + "Rui Yan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.760", + "point2d": [ + 8.280484199523926, + 71.37326049804688 + ], + "cluster": 49.0 + }, + { + "idx": 762, + "title": "What is the Real Intention behind this Question? Dataset Collection and Intention Classification", + "abstract": "Asking and answering questions are inseparable parts of human social life. The primary purposes of asking questions are to gain knowledge or request help which has been the subject of question-answering studies. However, questions can also reflect negative intentions and include implicit offenses, such as highlighting one\u2019s lack of knowledge or bolstering an alleged superior knowledge, which can lead to conflict in conversations; yet has been scarcely researched. This paper is the first study to introduce a dataset (Question Intention Dataset) that includes questions with positive/neutral and negative intentions and the underlying intention categories within each group. We further conduct a meta-analysis to highlight tacit and apparent intents. We also propose a classification method using Transformers augmented by TF-IDF-based features and report the results of several models for classifying the main intention categories. 
We aim to highlight the importance of taking intentions into account, especially implicit and negative ones, to gain insight into conflict-evoking questions and better understand human-human communication on the web for NLP applications.", + "authors": [ + "Maryam Sadat Mirzaei", + "Kourosh Meshgi", + "Satoshi Sekine" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.761", + "point2d": [ + 19.957714080810547, + 40.20896530151367 + ], + "cluster": 33.0 + }, + { + "idx": 763, + "title": "Conjunct Resolution in the Face of Verbal Omissions", + "abstract": "Verbal omissions are complex syntactic phenomena in VP coordination structures. They occur when verbs and (some of) their arguments are omitted from subsequent clauses after being explicitly stated in an initial clause. Recovering these omitted elements is necessary for accurate interpretation of the sentence, and while humans easily and intuitively fill in the missing information, state-of-the-art models continue to struggle with this task. Previous work is limited to small-scale datasets, synthetic data creation methods, and to resolution methods in the dependency-graph level. In this work we propose a conjunct resolution task that operates directly on the text and makes use of a split-and-rephrase paradigm in order to recover the missing elements in the coordination structure. To this end, we first formulate a pragmatic framework of verbal omissions which describes the different types of omissions, and develop an automatic scalable collection method. Based on this method, we curate a large dataset, containing over 10K examples of naturally-occurring verbal omissions with crowd-sourced annotations of the resolved conjuncts. We train various neural baselines for this task, and show that while our best method obtains decent performance, it leaves ample space for improvement. We propose our dataset, metrics and models as a starting point for future research on this topic.", + "authors": [ + "Royi Rassin", + "Yoav Goldberg", + "Reut Tsarfaty" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.762", + "point2d": [ + -17.61840057373047, + -71.63758850097656 + ], + "cluster": 41.0 + }, + { + "idx": 764, + "title": "Training Models to Generate, Recognize, and Reframe Unhelpful Thoughts", + "abstract": "Many cognitive approaches to well-being, such as recognizing and reframing unhelpful thoughts, have received considerable empirical support over the past decades, yet still lack truly widespread adoption in self-help format. A barrier to that adoption is a lack of adequately specific and diverse dedicated practice material. This work examines whether current language models can be leveraged to both produce a virtually unlimited quantity of practice material illustrating standard unhelpful thought patterns matching specific given contexts, and generate suitable positive reframing proposals. We propose PATTERNREFRAME, a novel dataset of about 10k examples of thoughts containing unhelpful thought patterns conditioned on a given persona, accompanied by about 27k positive reframes. 
By using this dataset to train and/or evaluate current models, we show that existing models can already be powerful tools to help generate an abundance of tailored practice material and hypotheses, with no or minimal additional model training required.", + "authors": [ + "Mounica Maddela", + "Megan Ung", + "Jing Xu", + "Andrea Madotto", + "Heather Foran", + "Y-Lan Boureau" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.763", + "point2d": [ + 36.078025817871094, + 69.6961441040039 + ], + "cluster": 33.0 + }, + { + "idx": 765, + "title": "Learning In-context Learning for Named Entity Recognition", + "abstract": "Named entity recognition in real-world applications suffers from the diversity of entity types, the emergence of new entity types, and the lack of high-quality annotations. To address the above problems, this paper proposes an in-context learning-based NER approach, which can effectively inject in-context NER ability into PLMs and recognize entities of novel types on-the-fly using only a few demonstrative instances. Specifically, we model PLMs as a meta-function \\Lambda_{(instruction, demonstrations, text)}.M, and a new entity extractor can be implicitly constructed by applying new instructions and demonstrations to PLMs, i.e., (\\Lambda . M)(instruction, demonstrations) -> F, where F: text -> entities will be a new entity extractor. To inject the above in-context NER ability into PLMs, we propose a meta-function pre-training algorithm, which pre-trains PLMs by comparing the (instruction, demonstration)-initialized extractor with a surrogate golden extractor. Experimental results on 4 few-shot NER datasets show that our method can effectively inject in-context NER ability into PLMs and significantly outperforms the PLMs+fine-tuning counterparts.", + "authors": [ + "Jiawei Chen", + "Yaojie Lu", + "Hongyu Lin", + "Jie Lou", + "Wei Jia", + "Dai Dai", + "Hua Wu", + "Boxi Cao", + "Xianpei Han", + "Le Sun" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.764", + "point2d": [ + 32.426727294921875, + -84.58000183105469 + ], + "cluster": 14.0 + }, + { + "idx": 766, + "title": "Holistic Prediction on a Time-Evolving Attributed Graph", + "abstract": "Graph-based prediction is essential in NLP tasks such as temporal knowledge graph completion. A cardinal question in this field is: how can we predict the future links, nodes, and attributes of a time-evolving attributed graph? Unfortunately, existing techniques assume that each link, node, and attribute prediction is independent, and fall short of predicting the appearance of new nodes that were not observed in the past. In this paper, we address two interrelated questions: (1) can we exploit task interdependence to improve prediction accuracy? and (2) can we predict new nodes with their attributes? We propose a unified framework that predicts node attributes and topology changes such as the appearance and disappearance of links and the emergence and loss of nodes. This framework comprises components for independent and interactive prediction and for predicting new nodes.
Our experimental study using real-world data confirms that our interdependent prediction framework achieves higher accuracy than methods based on independent prediction.", + "authors": [ + "Shohei Yamasaki", + "Yuya Sasaki", + "Panagiotis Karras", + "Makoto Onizuka" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.765", + "point2d": [ + 59.468048095703125, + -63.54537582397461 + ], + "cluster": 45.0 + }, + { + "idx": 767, + "title": "Modeling Instance Interactions for Joint Information Extraction with Neural High-Order Conditional Random Field", + "abstract": "Prior works on joint Information Extraction (IE) typically model instance (e.g., event triggers, entities, roles, relations) interactions by representation enhancement, type dependencies scoring, or global decoding. We find that the previous models generally consider binary type dependency scoring of a pair of instances, and leverage local search such as beam search to approximate global solutions. To better integrate cross-instance interactions, in this work, we introduce a joint IE framework (CRFIE) that formulates joint IE as a high-order Conditional Random Field. Specifically, we design binary factors and ternary factors to directly model interactions between not only a pair of instances but also triplets. Then, these factors are utilized to jointly predict labels of all instances.To address the intractability problem of exact high-order inference, we incorporate a high-order neural decoder that is unfolded from a mean-field variational inference method, which achieves consistent learning and inference. The experimental results show that our approach achieves consistent improvements on three IE tasks compared with our baseline and prior work.", + "authors": [ + "Zixia Jia", + "Zhaohui Yan", + "Wenjuan Han", + "Zilong Zheng", + "Kewei Tu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.766", + "point2d": [ + 44.952545166015625, + -52.94858169555664 + ], + "cluster": 28.0 + }, + { + "idx": 768, + "title": "Training Trajectories of Language Models Across Scales", + "abstract": "Scaling up language models has led to unprecedented performance gains, but little is understood about how the training dynamics change as models get larger. How do language models of different sizes learn during pre-training? Why do larger language models demonstrate more desirable behaviors? In this paper, we analyze the intermediate training checkpoints of differently sized OPT models (Zhang et al., 2022)\u2014from 125M to 175B parameters\u2014on next-token prediction, sequence-level generation and downstream tasks. We find that 1) at a given perplexity and independent of model sizes, a similar subset of training tokens see the most significant reduction in loss, with the rest stagnating or showing double-descent behavior (Nakkiran et al., 2020); 2) early in training, all models learn to reduce the perplexity of grammatical sequences that contain hallucinations, with small models halting at this suboptimal distribution and larger ones eventually learning to assign these sequences lower probabilities; and 3) perplexity is a strong predictor of in-context learning performance on 74 multiple-choice tasks from BIG-Bench, and this holds independent of the model size. 
Together, these results show that perplexity is more predictive of model behaviors than model size or training computation.", + "authors": [ + "Mengzhou Xia", + "Mikel Artetxe", + "Chunting Zhou", + "Xi Victoria Lin", + "Ramakanth Pasunuru", + "Danqi Chen", + "Luke Zettlemoyer", + "Veselin Stoyanov" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.767", + "point2d": [ + -26.294414520263672, + -16.903596878051758 + ], + "cluster": 8.0 + }, + { + "idx": 769, + "title": "A Diverse Set of Freely Available Linguistic Resources for Turkish", + "abstract": "This study presents a diverse set of freely available linguistic resources for Turkish natural language processing, including corpora, pretrained models and education material. Although Turkish is spoken by a sizeable population of over 80 million people, Turkish linguistic resources for natural language processing remain scarce. In this study, we provide corpora to allow practitioners to build their own applications and pretrained models that would assist industry researchers in creating quick prototypes. The provided corpora include named entity recognition datasets of diverse genres, including Wikipedia articles and supplement products customer reviews. In addition, crawling e-commerce and movie reviews websites, we compiled several sentiment analysis datasets of different genres. Our linguistic resources for Turkish also include pretrained spaCy language models. To the best of our knowledge, our models are the first spaCy models trained for the Turkish language. Finally, we provide various types of education material, such as video tutorials and code examples, that can support the interested audience on practicing Turkish NLP. The advantages of our linguistic resources are three-fold: they are freely available, they are first of their kind, and they are easy to use in a broad range of implementations. Along with a thorough description of the resource creation process, we also explain the position of our resources in the Turkish NLP world.", + "authors": [ + "Duygu Altinok" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.768", + "point2d": [ + -19.226016998291016, + -37.51917266845703 + ], + "cluster": 46.0 + }, + { + "idx": 770, + "title": "Measuring Consistency in Text-based Financial Forecasting Models", + "abstract": "Financial forecasting has been an important and active area of machine learning research, as even the most modest advantages in predictive accuracy can be parlayed into significant financial gains. Recent advances in natural language processing (NLP) bring the opportunity to leverage textual data, such as earnings reports of publicly traded companies, to predict the return rate for an asset. However, when dealing with such a sensitive task, the consistency of models \u2013 their invariance under meaning-preserving alternations in input \u2013 is a crucial property for building user trust. Despite this, current methods for financial forecasting do not take consistency into consideration. To address this issue, we propose FinTrust, an evaluation tool that assesses logical consistency in financial text. Using FinTrust, we show that the consistency of state-of-the-art NLP models for financial forecasting is poor. 
Our analysis of the performance degradation caused by meaning-preserving alternations suggests that current text-based methods are not suitable for robustly predicting market information.", + "authors": [ + "Linyi Yang", + "Yingpeng Ma", + "Yue Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.769", + "point2d": [ + 34.8273811340332, + 3.253143072128296 + ], + "cluster": 19.0 + }, + { + "idx": 771, + "title": "Optimal Transport for Unsupervised Hallucination Detection in Neural Machine Translation", + "abstract": "Neural machine translation (NMT) has become the de-facto standard in real-world machine translation applications. However, NMT models can unpredictably produce severely pathological translations, known as hallucinations, that seriously undermine user trust. It becomes thus crucial to implement effective preventive strategies to guarantee their proper functioning. In this paper, we address the problem of hallucination detection in NMT by following a simple intuition: as hallucinations are detached from the source content, they exhibit encoder-decoder attention patterns that are statistically different from those of good quality translations. We frame this problem with an optimal transport formulation and propose a fully unsupervised, plug-in detector that can be used with any attention-based NMT model. Experimental results show that our detector not only outperforms all previous model-based detectors, but is also competitive with detectors that employ external models trained on millions of samples for related tasks such as quality estimation and cross-lingual sentence similarity.", + "authors": [ + "Nuno M. Guerreiro", + "Pierre Colombo", + "Pablo Piantanida", + "Andr\u00e9 Martins" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.770", + "point2d": [ + -76.21363830566406, + -10.874163627624512 + ], + "cluster": 1.0 + }, + { + "idx": 772, + "title": "RankCSE: Unsupervised Sentence Representations Learning via Learning to Rank", + "abstract": "Unsupervised sentence representation learning is one of the fundamental problems in natural language processing with various downstream applications. Recently, contrastive learning has been widely adopted which derives high-quality sentence representations by pulling similar semantics closer and pushing dissimilar ones away. However, these methods fail to capture the fine-grained ranking information among the sentences, where each sentence is only treated as either positive or negative. In many real-world scenarios, one needs to distinguish and rank the sentences based on their similarities to a query sentence, e.g., very relevant, moderate relevant, less relevant, irrelevant, etc. In this paper, we propose a novel approach, RankCSE, for unsupervised sentence representation learning, which incorporates ranking consistency and ranking distillation with contrastive learning into a unified framework. In particular, we learn semantically discriminative sentence representations by simultaneously ensuring ranking consistency between two representations with different dropout masks, and distilling listwise ranking knowledge from the teacher. An extensive set of experiments are conducted on both semantic textual similarity (STS) and transfer (TR) tasks. 
Experimental results demonstrate the superior performance of our approach over several state-of-the-art baselines.", + "authors": [ + "Jiduan Liu", + "Jiahao Liu", + "Qifan Wang", + "Jingang Wang", + "Wei Wu", + "Yunsen Xian", + "Dongyan Zhao", + "Kai Chen", + "Rui Yan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.771", + "point2d": [ + 2.607167959213257, + -33.53433609008789 + ], + "cluster": 20.0 + }, + { + "idx": 773, + "title": "Entailment as Robust Self-Learner", + "abstract": "Entailment has been recognized as an important metric for evaluating natural language understanding (NLU) models, and recent studies have found that entailment pretraining benefits weakly supervised fine-tuning. In this work, we design a prompting strategy that formulates a number of different NLU tasks as contextual entailment. This approach improves the zero-shot adaptation of pretrained entailment models. Secondly, we notice that self-training entailment-based models with unlabeled data can significantly improve the adaptation performance on downstream tasks. To achieve more stable improvement, we propose the Simple Pseudo-Label Editing (SimPLE) algorithm for better pseudo-labeling quality in self-training. We also found that both pretrained entailment-based models and the self-trained models are robust against adversarial evaluation data. Experiments on binary and multi-class classification tasks show that SimPLE leads to more robust self-training results, indicating that the self-trained entailment models are more efficient and trustworthy than large language models on language understanding tasks.", + "authors": [ + "Jiaxin Ge", + "Hongyin Luo", + "Yoon Kim", + "James Glass" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.772", + "point2d": [ + -10.829401016235352, + -9.748753547668457 + ], + "cluster": 3.0 + }, + { + "idx": 774, + "title": "ReCode: Robustness Evaluation of Code Generation Models", + "abstract": "Code generation models have achieved impressive performance. However, they tend to be brittle as slight edits to a prompt could lead to very different generations; these robustness properties, critical for user experience when deployed in real-life applications, are not well understood. Most existing works on robustness in text or code tasks have focused on classification, while robustness in generation tasks is an uncharted area and to date there is no comprehensive benchmark for robustness in code generation. In this paper, we propose ReCode, a comprehensive robustness evaluation benchmark for code generation models. We customize over 30 transformations specifically for code on docstrings, function and variable names, code syntax, and code format. They are carefully designed to be natural in real-life coding practice, preserve the original semantic meaning, and thus provide multifaceted assessments of a model\u2019s robustness performance. With human annotators, we verified that over 90% of the perturbed prompts do not alter the semantic meaning of the original prompt. In addition, we define robustness metrics for code generation models considering the worst-case behavior under each type of perturbation, taking advantage of the fact that executing the generated code can serve as objective evaluation. We demonstrate ReCode on SOTA models using HumanEval, MBPP, as well as function completion tasks derived from them. 
Interesting observations include: better robustness for CodeGen over InCoder and GPT-J; models are most sensitive to syntax perturbations; more challenging robustness evaluation on MBPP over HumanEval.", + "authors": [ + "Shiqi Wang", + "Zheng Li", + "Haifeng Qian", + "Chenghao Yang", + "Zijian Wang", + "Mingyue Shang", + "Varun Kumar", + "Samson Tan", + "Baishakhi Ray", + "Parminder Bhatia", + "Ramesh Nallapati", + "Murali Krishna Ramanathan", + "Dan Roth", + "Bing Xiang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.773", + "point2d": [ + -8.676005363464355, + -53.83512496948242 + ], + "cluster": 11.0 + }, + { + "idx": 775, + "title": "EPIC: Multi-Perspective Annotation of a Corpus of Irony", + "abstract": "We present EPIC (English Perspectivist Irony Corpus), the first annotated corpus for irony analysis based on the principles of data perspectivism. The corpus contains short conversations from social media in five regional varieties of English, and it is annotated by contributors from five countries corresponding to those varieties. We analyse the resource along the perspectives induced by the diversity of the annotators, in terms of origin, age, and gender, and the relationship between these dimensions, irony, and the topics of conversation. We validate EPIC by creating perspective-aware models that encode the perspectives of annotators grouped according to their demographic characteristics. Firstly, the performance of perspectivist models confirms that different annotators induce very different models. Secondly, in the classification of ironic and non-ironic texts, perspectivist models prove to be generally more confident than the non-perspectivist ones. Furthermore, comparing the performance on a perspective-based test set with those achieved on a gold standard test set, we can observe how perspectivist models tend to detect more precisely the positive class, showing their ability to capture the different perceptions of irony. Thanks to these models, we are moreover able to show interesting insights about the variation in the perception of irony by the different groups of annotators, such as among different generations and nationalities.", + "authors": [ + "Simona Frenda", + "Alessandro Pedrani", + "Valerio Basile", + "Soda Marem Lo", + "Alessandra Teresa Cignarella", + "Raffaella Panizzon", + "Cristina Marco", + "Bianca Scarlini", + "Viviana Patti", + "Cristina Bosco", + "Davide Bernardi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.774", + "point2d": [ + 36.607513427734375, + 36.961669921875 + ], + "cluster": 19.0 + }, + { + "idx": 776, + "title": "Dialogue Summarization with Static-Dynamic Structure Fusion Graph", + "abstract": "Dialogue, the most fundamental and specially privileged arena of language, gains increasing ubiquity across the Web in recent years. Quickly going through the long dialogue context and capturing salient information scattered over the whole dialogue session benefit users in many real-world Web applications such as email thread summarization and meeting minutes draft. Dialogue summarization is a challenging task in that dialogue has dynamic interaction nature and presumably inconsistent information flow among various speakers. Many researchers address this task by modeling dialogue with pre-computed static graph structure using external linguistic toolkits. 
However, such methods heavily depend on the reliability of external tools, and the static graph construction is disjoint from the graph representation learning phase, which means the graph cannot be dynamically adapted for the downstream summarization task. In this paper, we propose a Static-Dynamic graph-based Dialogue Summarization model (SDDS), which fuses prior knowledge from human expertise and adaptively learns the graph structure in an end-to-end learning fashion. To verify the effectiveness of SDDS, we conduct experiments on three benchmark datasets (SAMSum, MediaSum, and DialogSum) and the results verify the superiority of SDDS.",
+        "authors": [
+            "Shen Gao",
+            "Xin Cheng",
+            "Mingzhe Li",
+            "Xiuying Chen",
+            "Jinpeng Li",
+            "Dongyan Zhao",
+            "Rui Yan"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.775",
+        "point2d": [
+            -2.1461148262023926,
+            53.84083557128906
+        ],
+        "cluster": 7.0
+    },
+    {
+        "idx": 777,
+        "title": "Large-Scale Correlation Analysis of Automated Metrics for Topic Models",
+        "abstract": "Automated coherence metrics constitute an important and popular way to evaluate topic models. Previous works present a mixed picture of their presumed correlation with human judgement. In this paper, we conduct a large-scale correlation analysis of coherence metrics. We propose a novel sampling approach to mine topics for the purpose of metric evaluation, and conduct the analysis via three large corpora showing that certain automated coherence metrics are correlated. Moreover, we extend the analysis to measure topical differences between corpora. Lastly, we examine the reliability of human judgement by conducting an extensive user study, which is designed as an amalgamation of different proxy tasks to derive a finer insight into the human decision-making processes. Our findings reveal some correlation between automated coherence metrics and human judgement, especially for generic corpora.",
+        "authors": [
+            "Jia Peng Lim",
+            "Hady Lauw"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.776",
+        "point2d": [
+            -10.232662200927734,
+            35.71009063720703
+        ],
+        "cluster": 47.0
+    },
+    {
+        "idx": 778,
+        "title": "U-CREAT: Unsupervised Case Retrieval using Events extrAcTion",
+        "abstract": "The task of Prior Case Retrieval (PCR) in the legal domain is about automatically citing relevant (based on facts and precedence) prior legal cases in a given query case. To further promote research in PCR, in this paper, we propose a new large benchmark (in English) for the PCR task: IL-PCR (Indian Legal Prior Case Retrieval) corpus. Given the complex nature of case relevance and the long size of legal documents, BM25 remains a strong baseline for ranking the cited prior documents. In this work, we explore the role of events in legal case retrieval and propose an unsupervised retrieval method-based pipeline U-CREAT (Unsupervised Case Retrieval using Events Extraction). We find that the proposed unsupervised retrieval method significantly increases performance compared to BM25 and makes retrieval faster by a considerable margin, making it applicable to real-time case retrieval systems. 
Our proposed system is generic: we show that it generalizes across two different legal systems (Indian and Canadian), and it achieves state-of-the-art performance on the benchmarks for both legal systems (IL-PCR and COLIEE corpora).",
+        "authors": [
+            "Abhinav Joshi",
+            "Akshat Sharma",
+            "Sai Kiran Tanikella",
+            "Ashutosh Modi"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.777",
+        "point2d": [
+            47.17015075683594,
+            -41.75879669189453
+        ],
+        "cluster": 28.0
+    },
+    {
+        "idx": 779,
+        "title": "ArgAnalysis35K : A large-scale dataset for Argument Quality Analysis",
+        "abstract": "Argument Quality Detection is an emerging field in NLP which has seen significant recent development. However, existing datasets in this field suffer from a lack of quality, quantity and diversity of topics and arguments, specifically the presence of vague arguments that are not persuasive in nature. In this paper, we leverage a combined experience of 10+ years of Parliamentary Debating to create a dataset that covers significantly more topics and has a wide range of sources to capture more diversity of opinion. With 34,890 high-quality argument-analysis pairs (a term we introduce in this paper), this is also the largest dataset of its kind to our knowledge. In addition to this contribution, we introduce an innovative argument scoring system based on instance-level annotator reliability and propose a quantitative model of scoring the relevance of arguments to a range of topics.",
+        "authors": [
+            "Omkar Joshi",
+            "Priya Pitre",
+            "Yashodhara Haribhakta"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.778",
+        "point2d": [
+            41.322662353515625,
+            39.0666618347168
+        ],
+        "cluster": 19.0
+    },
+    {
+        "idx": 780,
+        "title": "Reference Matters: Benchmarking Factual Error Correction for Dialogue Summarization with Fine-grained Evaluation Framework",
+        "abstract": "Factuality is important to dialogue summarization. Factual error correction (FEC) of model-generated summaries is one way to improve factuality. Current FEC evaluation that relies on factuality metrics is not reliable and detailed enough. To address this problem, we are the first to manually annotate a FEC dataset for dialogue summarization containing 4000 items and propose FERRANTI, a fine-grained evaluation framework based on reference correction that automatically evaluates the performance of FEC models on different error categories. Using this evaluation framework, we conduct sufficient experiments with FEC approaches under a variety of settings and find the best training modes and significant differences in the performance of the existing approaches on different factual error categories.",
+        "authors": [
+            "Mingqi Gao",
+            "Xiaojun Wan",
+            "Jia Su",
+            "Zhefeng Wang",
+            "Baoxing Huai"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.779",
+        "point2d": [
+            -5.091953754425049,
+            51.36500549316406
+        ],
+        "cluster": 47.0
+    },
+    {
+        "idx": 781,
+        "title": "Minding Language Models\u2019 (Lack of) Theory of Mind: A Plug-and-Play Multi-Character Belief Tracker",
+        "abstract": "Theory of Mind (ToM)\u2014the ability to reason about the mental states of other people\u2014is a key element of our social intelligence. Yet, despite their ever more impressive performance, large-scale neural language models still lack basic theory of mind capabilities out-of-the-box. 
We posit that simply scaling up models will not imbue them with theory of mind due to the inherently symbolic and implicit nature of the phenomenon, and instead investigate an alternative: can we design a decoding-time algorithm that enhances theory of mind of off-the-shelf neural language models without explicit supervision? We present SymbolicToM, a plug-and-play approach to reason about the belief states of multiple characters in reading comprehension tasks via explicit symbolic representation. More concretely, our approach tracks each entity\u2019s beliefs, their estimation of other entities\u2019 beliefs, and higher-order levels of reasoning, all through graphical representations, allowing for more precise and interpretable reasoning than previous approaches. Empirical results on the well-known ToMi benchmark (Le et al., 2019) demonstrate that SymbolicToM dramatically enhances off-the-shelf neural networks\u2019 theory of mind in a zero-shot setting while showing robust out-of-distribution performance compared to supervised baselines. Our work also reveals spurious patterns in existing theory of mind benchmarks, emphasizing the importance of out-of-distribution evaluation and methods that do not overfit a particular dataset.", + "authors": [ + "Melanie Sclar", + "Sachin Kumar", + "Peter West", + "Alane Suhr", + "Yejin Choi", + "Yulia Tsvetkov" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.780", + "point2d": [ + 39.33097839355469, + -8.962687492370605 + ], + "cluster": 36.0 + }, + { + "idx": 782, + "title": "Don\u2019t Retrain, Just Rewrite: Countering Adversarial Perturbations by Rewriting Text", + "abstract": "Can language models transform inputs to protect text classifiers against adversarial attacks? In this work, we present ATINTER, a model that intercepts and learns to rewrite adversarial inputs to make them non-adversarial for a downstream text classifier. Our experiments on four datasets and five attack mechanisms reveal that ATINTER is effective at providing better adversarial robustness than existing defense approaches, without compromising task accuracy. For example, on sentiment classification using the SST-2 dataset, our method improves the adversarial accuracy over the best existing defense approach by more than 4% with a smaller decrease in task accuracy (0.5 % vs 2.5%). Moreover, we show that ATINTER generalizes across multiple downstream tasks and classifiers without having to explicitly retrain it for those settings. For example, we find that when ATINTER is trained to remove adversarial perturbations for the sentiment classification task on the SST-2 dataset, it even transfers to a semantically different task of news classification (on AGNews) and improves the adversarial robustness by more than 10%.", + "authors": [ + "Ashim Gupta", + "Carter Blum", + "Temma Choji", + "Yingjie Fei", + "Shalin Shah", + "Alakananda Vempala", + "Vivek Srikumar" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.781", + "point2d": [ + 3.9038009643554688, + 6.900362491607666 + ], + "cluster": 48.0 + }, + { + "idx": 783, + "title": "Aggregating Multiple Heuristic Signals as Supervision for Unsupervised Automated Essay Scoring", + "abstract": "Automated Essay Scoring (AES) aims to evaluate the quality score for input essays. In this work, we propose a novel unsupervised AES approach ULRA, which does not require groundtruth scores of essays for training. 
The core idea of our ULRA is to use multiple heuristic quality signals as the pseudo-groundtruth, and then train a neural AES model by learning from the aggregation of these quality signals. To aggregate these inconsistent quality signals into a unified supervision, we view the AES task as a ranking problem, and design a special Deep Pairwise Rank Aggregation (DPRA) loss for training. In the DPRA loss, we set a learnable confidence weight for each signal to address the conflicts among signals, and train the neural AES model in a pairwise way to disentangle the cascade effect among partial-order pairs. Experiments on eight prompts of the ASAP dataset show that ULRA achieves the state-of-the-art performance compared with previous unsupervised methods in terms of both transductive and inductive settings. Further, our approach achieves comparable performance with many existing domain-adapted supervised models, showing the effectiveness of ULRA. The code is available at https://github.com/tenvence/ulra.",
+        "authors": [
+            "Cong Wang",
+            "Zhiwei Jiang",
+            "Yafeng Yin",
+            "Zifeng Cheng",
+            "Shiping Ge",
+            "Qing Gu"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.782",
+        "point2d": [
+            -13.771958351135254,
+            25.833316802978516
+        ],
+        "cluster": 17.0
+    },
+    {
+        "idx": 784,
+        "title": "Mitigating Label Biases for In-context Learning",
+        "abstract": "Various design settings for in-context learning (ICL), such as the choice and order of the in-context examples, can bias the model\u2019s predictions. While many studies discuss these design choices, there have been few systematic investigations into categorizing them and mitigating their impact. In this work, we define a typology for three types of label biases in ICL for text classification: vanilla-label bias, context-label bias, and domain-label bias (which we conceptualize and detect for the first time). Our analysis demonstrates that prior label bias calibration methods fall short of addressing all three types of biases. Specifically, domain-label bias restricts LLMs to random-level performance on many tasks regardless of the choice of in-context examples. To mitigate the effect of these biases, we propose a simple bias calibration method that estimates a language model\u2019s label bias using random in-domain words from the task corpus. After controlling for this estimated bias when making predictions, our novel domain-context calibration significantly improves the ICL performance of GPT-J and GPT-3 on a wide range of tasks. The gain is substantial on tasks with large domain-label bias (up to 37% in Macro-F1). Furthermore, our results generalize to models with different scales, pretraining methods, and manually-designed task instructions, showing the prevalence of label biases in ICL.",
+        "authors": [
+            "Yu Fei",
+            "Yifan Hou",
+            "Zeming Chen",
+            "Antoine Bosselut"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.783",
+        "point2d": [
+            -13.168292999267578,
+            -23.296873092651367
+        ],
+        "cluster": 3.0
+    },
+    {
+        "idx": 785,
+        "title": "QUEST: A Retrieval Dataset of Entity-Seeking Queries with Implicit Set Operations",
+        "abstract": "Formulating selective information needs results in queries that implicitly specify set operations, such as intersection, union, and difference. For instance, one might search for \u201cshorebirds that are not sandpipers\u201d or \u201cscience-fiction films shot in England\u201d. 
To study the ability of retrieval systems to meet such information needs, we construct QUEST, a dataset of 3357 natural language queries with implicit set operations, that map to a set of entities corresponding to Wikipedia documents. The dataset challenges models to match multiple constraints mentioned in queries with corresponding evidence in documents and correctly perform various set operations. The dataset is constructed semi-automatically using Wikipedia category names. Queries are automatically composed from individual categories, then paraphrased and further validated for naturalness and fluency by crowdworkers. Crowdworkers also assess the relevance of entities based on their documents and highlight attribution of query constraints to spans of document text. We analyze several modern retrieval systems, finding that they often struggle on such queries. Queries involving negation and conjunction are particularly challenging and systems are further challenged with combinations of these operations.", + "authors": [ + "Chaitanya Malaviya", + "Peter Shaw", + "Ming-Wei Chang", + "Kenton Lee", + "Kristina Toutanova" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.784", + "point2d": [ + 64.67274475097656, + -2.954475164413452 + ], + "cluster": 45.0 + }, + { + "idx": 786, + "title": "Dynamic Heterogeneous-Graph Reasoning with Language Models and Knowledge Representation Learning for Commonsense Question Answering", + "abstract": "Recently, knowledge graphs (KGs) have won noteworthy success in commonsense question answering. Existing methods retrieve relevant subgraphs in the KGs through key entities and reason about the answer with language models (LMs) and graph neural networks. However, they ignore (i) optimizing the knowledge representation and structure of subgraphs and (ii) deeply fusing heterogeneous QA context with subgraphs. In this paper, we propose a dynamic heterogeneous-graph reasoning method with LMs and knowledge representation learning (DHLK), which constructs a heterogeneous knowledge graph (HKG) based on multiple knowledge sources and optimizes the structure and knowledge representation of the HKG using a two-stage pruning strategy and knowledge representation learning (KRL). It then performs joint reasoning by LMs and Relation Mask Self-Attention (RMSA). Specifically, DHLK filters key entities based on the dictionary vocabulary to achieve the first-stage pruning while incorporating the paraphrases in the dictionary into the subgraph to construct the HKG. Then, DHLK encodes and fuses the QA context and HKG using LM, and dynamically removes irrelevant KG entities based on the attention weights of LM for the second-stage pruning. Finally, DHLK introduces KRL to optimize the knowledge representation and perform answer reasoning on the HKG by RMSA.We evaluate DHLK at CommonsenseQA and OpenBookQA, and show its improvement on existing LM and LM+KG methods.", + "authors": [ + "Yujie Wang", + "Hu Zhang", + "Jiye Liang", + "Ru Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.785", + "point2d": [ + 69.70631408691406, + -0.07718997448682785 + ], + "cluster": 45.0 + }, + { + "idx": 787, + "title": "Do You Hear The People Sing? Key Point Analysis via Iterative Clustering and Abstractive Summarisation", + "abstract": "Argument summarisation is a promising but currently under-explored field. 
Recent work has aimed to provide textual summaries in the form of concise and salient short texts, i.e., key points (KPs), in a task known as Key Point Analysis (KPA). One of the main challenges in KPA is finding high-quality key point candidates from dozens of arguments even in a small corpus. Furthermore, evaluating key points is crucial in ensuring that the automatically generated summaries are useful. Although automatic methods for evaluating summarisation have considerably advanced over the years, they mainly focus on sentence-level comparison, making it difficult to measure the quality of a summary (a set of KPs) as a whole. Aggravating this problem is the fact that human evaluation is costly and unreproducible. To address the above issues, we propose a two-step abstractive summarisation framework based on neural topic modelling with an iterative clustering procedure, to generate key points which are aligned with how humans identify key points. Our experiments show that our framework advances the state of the art in KPA, with performance improvement of up to 14 (absolute) percentage points, in terms of both ROUGE and our own proposed evaluation metrics. Furthermore, we evaluate the generated summaries using a novel set-based evaluation toolkit. Our quantitative analysis demonstrates the effectiveness of our proposed evaluation metrics in assessing the quality of generated KPs. Human evaluation further demonstrates the advantages of our approach and validates that our proposed evaluation metric is more consistent with human judgment than ROUGE scores.", + "authors": [ + "Hao Li", + "Viktor Schlegel", + "Riza Batista-Navarro", + "Goran Nenadic" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.786", + "point2d": [ + -3.994410276412964, + 41.27885818481445 + ], + "cluster": 7.0 + }, + { + "idx": 788, + "title": "Ambiguous Learning from Retrieval: Towards Zero-shot Semantic Parsing", + "abstract": "Current neural semantic parsers take a supervised approach requiring a considerable amount of training data which is expensive and difficult to obtain. Thus, minimizing the supervision effort is one of the key challenges in semantic parsing. In this paper, we propose the Retrieval as Ambiguous Supervision framework, in which we construct a retrieval system based on pretrained language models to collect high-coverage candidates. Assuming candidates always contain the correct ones, we convert zero-shot task into ambiguously supervised task. To improve the precision and coverage of such ambiguous supervision, we propose a confidence-driven self-training algorithm, in which a semantic parser is learned and exploited to disambiguate the candidates iteratively. Experimental results show that our approach significantly outperforms the state-of-the-art zero-shot semantic parsing methods.", + "authors": [ + "Shan Wu", + "Chunlei Xin", + "Hongyu Lin", + "Xianpei Han", + "Cao Liu", + "Jiansong Chen", + "Fan Yang", + "Guanglu Wan", + "Le Sun" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.787", + "point2d": [ + -32.49413299560547, + -58.35755920410156 + ], + "cluster": 41.0 + }, + { + "idx": 789, + "title": "Explicit Syntactic Guidance for Neural Text Generation", + "abstract": "Most existing text generation models follow the sequence-to-sequence paradigm. Generative Grammar suggests that humans generate natural language texts by learning language grammar. 
We propose a syntax-guided generation schema, which generates the sequence guided by a constituency parse tree in a top-down direction. The decoding process can be decomposed into two parts: (1) predicting the infilling texts for each constituent in the lexicalized syntax context given the source sentence; (2) mapping and expanding each constituent to construct the next-level syntax context. Accordingly, we propose a structural beam search method to find possible syntax structures hierarchically. Experiments on paraphrase generation and machine translation show that the proposed method outperforms autoregressive baselines, while also demonstrating effectiveness in terms of interpretability, controllability, and diversity.", + "authors": [ + "Yafu Li", + "Leyang Cui", + "Jianhao Yan", + "Yongjing Yin", + "Wei Bi", + "Shuming Shi", + "Yue Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.788", + "point2d": [ + -30.542747497558594, + 14.272903442382812 + ], + "cluster": 4.0 + }, + { + "idx": 790, + "title": "What does a Text Classifier Learn about Morality? An Explainable Method for Cross-Domain Comparison of Moral Rhetoric", + "abstract": "Moral rhetoric influences our judgement. Although social scientists recognize moral expression as domain specific, there are no systematic methods for analyzing whether a text classifier learns the domain-specific expression of moral language or not. We propose Tomea, a method to compare a supervised classifier\u2019s representation of moral rhetoric across domains. Tomea enables quantitative and qualitative comparisons of moral rhetoric via an interpretable exploration of similarities and differences across moral concepts and domains. We apply Tomea on moral narratives in thirty-five thousand tweets from seven domains. We extensively evaluate the method via a crowd study, a series of cross-domain moral classification comparisons, and a qualitative analysis of cross-domain moral expression.", + "authors": [ + "Enrico Liscio", + "Oscar Araque", + "Lorenzo Gatti", + "Ionut Constantinescu", + "Catholijn Jonker", + "Kyriaki Kalimeri", + "Pradeep Kumar Murukannaiah" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.789", + "point2d": [ + 32.80608367919922, + 41.88323974609375 + ], + "cluster": 19.0 + }, + { + "idx": 791, + "title": "Graph-based Relation Mining for Context-free Out-of-vocabulary Word Embedding Learning", + "abstract": "The out-of-vocabulary (OOV) words are difficult to represent while critical to the performance of embedding-based downstream models. Prior OOV word embedding learning methods failed to model complex word formation well. In this paper, we propose a novel graph-based relation mining method, namely GRM, for OOV word embedding learning. We first build a Word Relationship Graph (WRG) based on word formation and associate OOV words with their semantically relevant words, which can mine the relational information inside word structures. Subsequently, our GRM can infer high-quality embeddings for OOV words through passing and aggregating semantic attributes and relational information in the WRG, regardless of contextual richness. 
Extensive experiments demonstrate that our model significantly outperforms state-of-the-art baselines on both intrinsic and downstream tasks when faced with OOV words.",
+        "authors": [
+            "Ziran Liang",
+            "Yuyin Lu",
+            "HeGang Chen",
+            "Yanghui Rao"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.790",
+        "point2d": [
+            2.0279085636138916,
+            -38.94761276245117
+        ],
+        "cluster": 9.0
+    },
+    {
+        "idx": 792,
+        "title": "Multimodal Persona Based Generation of Comic Dialogs",
+        "abstract": "We focus on the novel problem of persona based dialogue generation for comic strips. Dialog in comic strips is a unique and unexplored area where every strip contains utterances from various characters, with each one building upon the previous utterances and the associated visual scene. Previous works like DialoGPT, PersonaGPT and other dialog generation models encode two-party dialogues and do not account for the visual information. To the best of our knowledge we are the first to propose the paradigm of multimodal persona based dialogue generation. We contribute a novel dataset, ComSet, consisting of 54K strips, harvested from 13 popular comics available online. Further, we propose a multimodal persona-based architecture, MPDialog, to generate dialogues for the next panel in the strip, which decreases the perplexity score by ~10 points over strong dialogue generation baseline models. We demonstrate that there is still ample opportunity for improvement, highlighting the importance of building stronger dialogue systems that are able to generate persona-consistent dialogues and understand the context through various modalities.",
+        "authors": [
+            "Harsh Agrawal",
+            "Aditya Mishra",
+            "Manish Gupta",
+            "Mausam -"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.791",
+        "point2d": [
+            11.134284973144531,
+            75.46389770507812
+        ],
+        "cluster": 49.0
+    },
+    {
+        "idx": 793,
+        "title": "LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion",
+        "abstract": "We present LLM-Blender, an ensembling framework designed to attain consistently superior performance by leveraging the diverse strengths of multiple open-source large language models (LLMs). Our framework consists of two modules: PairRanker and GenFuser, addressing the observation that optimal LLMs for different examples can significantly vary. PairRanker employs a specialized pairwise comparison method to distinguish subtle differences between candidate outputs. It jointly encodes the input text and a pair of candidates, using cross-attention encoders to determine the superior one. Our results demonstrate that PairRanker exhibits the highest correlation with ChatGPT-based ranking. Then, GenFuser aims to merge the top-ranked candidates, generating an improved output by capitalizing on their strengths and mitigating their weaknesses. To facilitate large-scale evaluation, we introduce a benchmark dataset, MixInstruct, which is a mixture of multiple instruction datasets featuring oracle pairwise comparisons. 
Our LLM-Blender significantly outperforms individual LLMs and baseline methods across various metrics, establishing a substantial performance gap.",
+        "authors": [
+            "Dongfu Jiang",
+            "Xiang Ren",
+            "Bill Yuchen Lin"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.792",
+        "point2d": [
+            -33.53036880493164,
+            -6.2230377197265625
+        ],
+        "cluster": 20.0
+    },
+    {
+        "idx": 794,
+        "title": "Seen to Unseen: Exploring Compositional Generalization of Multi-Attribute Controllable Dialogue Generation",
+        "abstract": "Existing controllable dialogue generation work focuses on the single-attribute control and lacks generalization capability to out-of-distribution multiple attribute combinations. In this paper, we explore the compositional generalization for multi-attribute controllable dialogue generation where a model can learn from seen attribute values and generalize to unseen combinations. We propose a prompt-based disentangled controllable dialogue generation model, DCG. It learns attribute concept composition by generating attribute-oriented prompt vectors and uses a disentanglement loss to disentangle different attributes for better generalization. Besides, we design a unified reference-free evaluation framework for multiple attributes with different levels of granularities. Experimental results on two benchmarks demonstrate the effectiveness of our method and the evaluation metric.",
+        "authors": [
+            "Weihao Zeng",
+            "Lulu Zhao",
+            "Keqing He",
+            "Ruotong Geng",
+            "Jingang Wang",
+            "Wei Wu",
+            "Weiran Xu"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.793",
+        "point2d": [
+            10.090852737426758,
+            68.84066009521484
+        ],
+        "cluster": 24.0
+    },
+    {
+        "idx": 795,
+        "title": "Generating Structured Pseudo Labels for Noise-resistant Zero-shot Video Sentence Localization",
+        "abstract": "Video sentence localization aims to locate moments in an unstructured video according to a given natural language query. The main challenges are the expensive annotation cost and the annotation bias. In this work, we study video sentence localization in a zero-shot setting, which learns with only video data without any annotation. Existing zero-shot pipelines usually generate event proposals and then generate a pseudo query for each event proposal. However, their event proposals are obtained via visual feature clustering, which is query-independent and inaccurate; and the pseudo-queries are short or less interpretable. Moreover, existing approaches ignore the risk of pseudo-label noise when leveraging them in training. To address the above problems, we propose a Structure-based Pseudo Label generation (SPL) method, which first generates free-form interpretable pseudo queries before constructing query-dependent event proposals by modeling the event temporal structure. To mitigate the effect of pseudo-label noise, we propose a noise-resistant iterative method that repeatedly re-weights the training samples based on noise estimation to train a grounding model and correct pseudo labels. Experiments on the ActivityNet Captions and Charades-STA datasets demonstrate the advantages of our approach. 
Code can be found at https://github.com/minghangz/SPL.", + "authors": [ + "Minghang Zheng", + "Shaogang Gong", + "Hailin Jin", + "Yuxin Peng", + "Yang Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.794", + "point2d": [ + -59.1212158203125, + 60.411800384521484 + ], + "cluster": 32.0 + }, + { + "idx": 796, + "title": "IndicMT Eval: A Dataset to Meta-Evaluate Machine Translation Metrics for Indian Languages", + "abstract": "The rapid growth of machine translation (MT) systems necessitates meta-evaluations of evaluation metrics to enable selection of those that best reflect MT quality. Unfortunately, most meta-evaluation studies focus on European languages, the observations for which may not always apply to other languages. Indian languages, having over a billion speakers, are linguistically different from them, and to date, there are no such systematic studies focused solely on English to Indian language MT. This paper fills this gap through a Multidimensional Quality Metric (MQM) dataset consisting of 7000 fine-grained annotations, spanning 5 Indian languages and 7 MT systems. We evaluate 16 metrics and show that, pre-trained metrics like COMET have the highest correlations with annotator scores as opposed to n-gram metrics like BLEU. We further leverage our MQM annotations to develop an Indic-COMET metric and show that it outperforms COMET counterparts in both human scores correlations and robustness scores in Indian languages. Additionally, we show that the Indic-COMET can outperform COMET on some unseen Indian languages. We hope that our dataset and analysis will facilitate further research in Indic MT evaluation.", + "authors": [ + "Ananya Sai B", + "Tanay Dixit", + "Vignesh Nagarajan", + "Anoop Kunchukuttan", + "Pratyush Kumar", + "Mitesh M. Khapra", + "Raj Dabre" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.795", + "point2d": [ + -72.92757415771484, + -2.269550085067749 + ], + "cluster": 1.0 + }, + { + "idx": 797, + "title": "Weaker Than You Think: A Critical Look at Weakly Supervised Learning", + "abstract": "Weakly supervised learning is a popular approach for training machine learning models in low-resource settings. Instead of requesting high-quality yet costly human annotations, it allows training models with noisy annotations obtained from various weak sources. Recently, many sophisticated approaches have been proposed for robust training under label noise, reporting impressive results. In this paper, we revisit the setup of these approaches and find that the benefits brought by these approaches are significantly overestimated. Specifically, we find that the success of existing weakly supervised learning approaches heavily relies on the availability of clean validation samples which, as we show, can be leveraged much more efficiently by simply training on them. After using these clean labels in training, the advantages of using these sophisticated approaches are mostly wiped out. This remains true even when reducing the size of the available clean data to just five samples per class, making these approaches impractical. To understand the true value of weakly supervised learning, we thoroughly analyze diverse NLP datasets and tasks to ascertain when and why weakly supervised approaches work. 
Based on our findings, we provide recommendations for future research.",
+        "authors": [
+            "Dawei Zhu",
+            "Xiaoyu Shen",
+            "Marius Mosbach",
+            "Andreas Stephan",
+            "Dietrich Klakow"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.796",
+        "point2d": [
+            -2.894329786300659,
+            -16.303003311157227
+        ],
+        "cluster": 17.0
+    },
+    {
+        "idx": 798,
+        "title": "Prompt Tuning Pushes Farther, Contrastive Learning Pulls Closer: A Two-Stage Approach to Mitigate Social Biases",
+        "abstract": "As the representation capability of Pre-trained Language Models (PLMs) improves, there is growing concern that they will inherit social biases from unprocessed corpora. Most previous debiasing techniques used Counterfactual Data Augmentation (CDA) to balance the training corpus. However, CDA slightly modifies the original corpus, limiting the representation distance between different demographic groups to a narrow range. As a result, the debiasing model easily fits the differences between counterfactual pairs, which affects its debiasing performance with limited text resources. In this paper, we propose an adversarial training-inspired two-stage debiasing model using Contrastive learning with Continuous Prompt Augmentation (named CCPA) to mitigate social biases in PLMs\u2019 encoding. In the first stage, we propose a data augmentation method based on continuous prompt tuning to push farther the representation distance between sample pairs along different demographic groups. In the second stage, we utilize contrastive learning to pull closer the representation distance between the augmented sample pairs and then fine-tune PLMs\u2019 parameters to get debiased encoding. Our approach guides the model to achieve stronger debiasing performance by adding difficulty to the training process. Extensive experiments show that CCPA outperforms baselines in terms of debiasing performance. Meanwhile, experimental results on the GLUE benchmark show that CCPA retains the language modeling capability of PLMs.",
+        "authors": [
+            "Yingji Li",
+            "Mengnan Du",
+            "Xin Wang",
+            "Ying Wang"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.797",
+        "point2d": [
+            12.614609718322754,
+            27.315444946289062
+        ],
+        "cluster": 48.0
+    },
+    {
+        "idx": 799,
+        "title": "Towards Understanding Omission in Dialogue Summarization",
+        "abstract": "Dialogue summarization aims to condense the lengthy dialogue into a concise summary, and has recently achieved significant progress. However, the result of existing methods is still far from satisfactory. Previous works indicated that omission is a major factor in affecting the quality of summarization, but few of them have further explored the omission problem, such as how omission affects summarization results and how to detect omission, which is critical for reducing omission and improving summarization quality. Moreover, analyzing and detecting omission relies on summarization datasets with omission labels (i.e., which dialogue utterances are omitted in the summarization), which are not available in the current literature. In this paper, we propose the OLDS dataset, which provides high-quality omission labels for dialogue summarization. 
By analyzing this dataset, we find that a large improvement in summarization quality can be achieved by providing ground-truth omission labels for the summarization model to recover omission information, which demonstrates the importance of omission detection for omission mitigation in dialogue summarization. Therefore, we formulate an omission detection task and demonstrate that our proposed dataset can support the training and evaluation of this task well. We also call for research action on omission detection based on our proposed datasets. Our dataset and codes are publicly available.",
+        "authors": [
+            "Yicheng Zou",
+            "Kaitao Song",
+            "Xu Tan",
+            "Zhongkai Fu",
+            "Qi Zhang",
+            "Dongsheng Li",
+            "Tao Gui"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.798",
+        "point2d": [
+            -4.181209564208984,
+            52.43238067626953
+        ],
+        "cluster": 47.0
+    },
+    {
+        "idx": 800,
+        "title": "Python Code Generation by Asking Clarification Questions",
+        "abstract": "Code generation from text requires understanding the user\u2019s intent from a natural language description and generating an executable code snippet that satisfies this intent. While recent pretrained language models demonstrate remarkable performance for this task, these models fail when the given natural language description is under-specified. In this work, we introduce a novel and more realistic setup for this task. We hypothesize that the under-specification of a natural language description can be resolved by asking clarification questions. Therefore, we collect and introduce a new dataset named CodeClarQA containing pairs of natural language descriptions and code with created synthetic clarification questions and answers. The empirical results of our evaluation of pretrained language model performance on code generation show that clarifications result in more precisely generated code, as shown by the substantial improvement of model performance in all evaluation metrics. Alongside this, our task and dataset introduce new challenges to the community, including when and what clarification questions should be asked. Our code and dataset are available on GitHub.",
+        "authors": [
+            "Haau-Sing (Xiaocheng) Li",
+            "Mohsen Mesgar",
+            "Andr\u00e9 Martins",
+            "Iryna Gurevych"
+        ],
+        "year": 2023,
+        "source": "acl",
+        "publication_type": "long",
+        "doi": "10.18653/v1/2023.acl-long.799",
+        "point2d": [
+            -8.445708274841309,
+            -52.033451080322266
+        ],
+        "cluster": 11.0
+    },
+    {
+        "idx": 801,
+        "title": "A Compare-and-contrast Multistage Pipeline for Uncovering Financial Signals in Financial Reports",
+        "abstract": "In this paper, we address the challenge of discovering financial signals in narrative financial reports. As these documents are often lengthy and tend to blend routine information with new information, it is challenging for professionals to discern critical financial signals. To this end, we leverage the inherent nature of the year-to-year structure of reports to define a novel signal-highlighting task; more importantly, we propose a compare-and-contrast multistage pipeline that recognizes different relationships between the reports and locates relevant rationales for these relationships. We also create and publicly release a human-annotated dataset for our task. 
Our experiments on the dataset validate the effectiveness of our pipeline, and we provide detailed analyses and ablation studies to support our findings.", + "authors": [ + "Jia-Huei Ju", + "Yu-Shiang Huang", + "Cheng-Wei Lin", + "Che Lin", + "Chuan-Ju Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.800", + "point2d": [ + 34.87336349487305, + 5.907059669494629 + ], + "cluster": 19.0 + }, + { + "idx": 802, + "title": "Improving the robustness of NLI models with minimax training", + "abstract": "Natural language inference (NLI) models are susceptible to learning shortcuts, i.e. decision rules that spuriously correlate with the label. As a result, they achieve high in-distribution performance, but fail to generalize to out-of-distribution samples where such correlations do not hold. In this paper, we present a training method to reduce the reliance of NLI models on shortcuts and improve their out-of-distribution performance without assuming prior knowledge of the shortcuts being targeted. To this end, we propose a minimax objective between a learner model being trained for the NLI task, and an auxiliary model aiming to maximize the learner\u2019s loss by up-weighting examples from regions of the input space where the learner incurs high losses. This process incentivizes the learner to focus on under-represented \u201chard\u201d examples with patterns that contradict the shortcuts learned from the prevailing \u201ceasy\u201d examples. Experimental results on three NLI datasets demonstrate that our method consistently outperforms other robustness enhancing techniques on out-of-distribution adversarial test sets, while maintaining high in-distribution accuracy.", + "authors": [ + "Michalis Korakakis", + "Andreas Vlachos" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.801", + "point2d": [ + 2.5664641857147217, + 3.074305534362793 + ], + "cluster": 48.0 + }, + { + "idx": 803, + "title": "USSA: A Unified Table Filling Scheme for Structured Sentiment Analysis", + "abstract": "Most previous studies on Structured Sentiment Analysis (SSA) have cast it as a problem of bi-lexical dependency parsing, which cannot address issues of overlap and discontinuity simultaneously. In this paper, we propose a niche-targeting and effective solution. Our approach involves creating a novel bi-lexical dependency parsing graph, which is then converted to a unified 2D table-filling scheme, namely USSA. The proposed scheme resolves the kernel bottleneck of previous SSA methods by utilizing 13 different types of relations. In addition, to closely collaborate with the USSA scheme, we have developed a model that includes a proposed bi-axial attention module to effectively capture the correlations among relations in the rows and columns of the table. 
Extensive experimental results on benchmark datasets demonstrate the effectiveness and robustness of our proposed framework, outperforming state-of-the-art methods consistently.", + "authors": [ + "Zepeng Zhai", + "Hao Chen", + "Ruifan Li", + "Xiaojie Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.802", + "point2d": [ + 18.76806640625, + -35.68673324584961 + ], + "cluster": 13.0 + }, + { + "idx": 804, + "title": "PAD-Net: An Efficient Framework for Dynamic Networks", + "abstract": "Dynamic networks, e.g., Dynamic Convolution (DY-Conv) and the Mixture of Experts (MoE), have been extensively explored as they can considerably improve the model\u2019s representation power with acceptable computational cost. The common practice in implementing dynamic networks is to convert the given static layers into fully dynamic ones where all parameters are dynamic (at least within a single layer) and vary with the input. However, such a fully dynamic setting may cause redundant parameters and high deployment costs, limiting the applicability of dynamic networks to a broader range of tasks and models. The main contributions of our work are challenging the basic commonsense in dynamic networks and proposing a partially dynamic network, namely PAD-Net, to transform the redundant dynamic parameters into static ones. Also, we further design Iterative Mode Partition to partition dynamic and static parameters efficiently. Our method is comprehensively supported by large-scale experiments with two typical advanced dynamic architectures, i.e., DY-Conv and MoE, on both image classification and GLUE benchmarks. Encouragingly, we surpass the fully dynamic networks by +0.7\\% top-1 acc with only 30% dynamic parameters for ResNet-50 and +1.9\\% average score in language understanding with only 50% dynamic parameters for BERT. Code will be released at: https://github.com/Shwai-He/PAD-Net.", + "authors": [ + "Shwai He", + "Liang Ding", + "Daize Dong", + "Boan Liu", + "Fuqiang Yu", + "Dacheng Tao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.803", + "point2d": [ + -51.99391555786133, + 31.417282104492188 + ], + "cluster": 8.0 + }, + { + "idx": 805, + "title": "Resolving Ambiguities in Text-to-Image Generative Models", + "abstract": "Natural language often contains ambiguities that can lead to misinterpretation and miscommunication. While humans can handle ambiguities effectively by asking clarifying questions and/or relying on contextual cues and common-sense knowledge, resolving ambiguities can be notoriously hard for machines. In this work, we study ambiguities that arise in text-to-image generative models. We curate the Text-to-image Ambiguity Benchmark (TAB) dataset to study different types of ambiguities in text-to-image generative models. We then propose the Text-to-ImagE Disambiguation (TIED) framework to disambiguate the prompts given to the text-to-image generative models by soliciting clarifications from the end user. 
Through automatic and human evaluations, we show the effectiveness of our framework in generating more faithful images aligned with end user intention in the presence of ambiguities.", + "authors": [ + "Ninareh Mehrabi", + "Palash Goyal", + "Apurv Verma", + "Jwala Dhamala", + "Varun Kumar", + "Qian Hu", + "Kai-Wei Chang", + "Richard Zemel", + "Aram Galstyan", + "Rahul Gupta" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.804", + "point2d": [ + -61.77777862548828, + 46.64888381958008 + ], + "cluster": 43.0 + }, + { + "idx": 806, + "title": "Knowledge Unlearning for Mitigating Privacy Risks in Language Models", + "abstract": "Pretrained Language Models (LMs) memorize a vast amount of knowledge during initial pretraining, including information that may violate the privacy of personal lives and identities. Previous work addressing privacy issues for LMs has mostly focused on data preprocessing and differential privacy methods, both requiring re-training the underlying LM. We propose knowledge unlearning as an alternative method to reduce privacy risks for LMs post hoc. We show that simply performing gradient ascent on target token sequences is effective at forgetting them with little to no degradation of general language modeling performances for larger-sized LMs. We also find that sequential unlearning is better than trying to unlearn all the data at once and that unlearning is highly dependent on which kind of data (domain) is forgotten. By showing comparisons with previous methods known to mitigate privacy risks for LMs, we show that our approach can give a stronger empirical privacy guarantee in scenarios where the data vulnerable to extraction attacks are known a priori while being much more efficient and robust.", + "authors": [ + "Joel Jang", + "Dongkeun Yoon", + "Sohee Yang", + "Sungmin Cha", + "Moontae Lee", + "Lajanugen Logeswaran", + "Minjoon Seo" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.805", + "point2d": [ + -1.8798444271087646, + 14.567614555358887 + ], + "cluster": 15.0 + }, + { + "idx": 807, + "title": "Unnatural Instructions: Tuning Language Models with (Almost) No Human Labor", + "abstract": "Instruction tuning enables pretrained language models to perform new tasks from inference-time natural language descriptions. These approaches rely on vast amounts of human supervision in the form of crowdsourced datasets or user interactions. In this work, we introduce Unnatural Instructions: a large dataset of creative and diverse instructions, collected with virtually no human labor. We collect 64,000 examples by prompting a language model with three seed examples of instructions and eliciting a fourth. This set is then expanded by prompting the model to rephrase each instruction, creating a total of approximately 240,000 examples of instructions, inputs, and outputs. Experiments show that despite containing a fair amount of noise, training on Unnatural Instructions rivals the effectiveness of training on open-source manually-curated datasets, surpassing the performance of models such as T0++ and Tk-Instruct across various benchmarks. 
These results demonstrate the potential of model-generated data as a cost-effective alternative to crowdsourcing for dataset expansion and diversification.", + "authors": [ + "Or Honovich", + "Thomas Scialom", + "Omer Levy", + "Timo Schick" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.806", + "point2d": [ + -19.140567779541016, + -17.690776824951172 + ], + "cluster": 3.0 + }, + { + "idx": 808, + "title": "To Adapt or to Annotate: Challenges and Interventions for Domain Adaptation in Open-Domain Question Answering", + "abstract": "Recent advances in open-domain question answering (ODQA) have demonstrated impressive accuracy on general-purpose domains like Wikipedia. While some work has been investigating how well ODQA models perform when tested for out-of-domain (OOD) generalization, these studies have been conducted only under conservative shifts in data distribution and typically focus on a single component (i.e., retriever or reader) rather than an end-to-end system. This work proposes a more realistic end-to-end domain shift evaluation setting covering five diverse domains. We not only find that end-to-end models fail to generalize but that high retrieval scores often still yield poor answer prediction accuracy. To address these failures, we investigate several interventions, in the form of data augmentations, for improving model adaptation and use our evaluation set to elucidate the relationship between the efficacy of an intervention scheme and the particular type of dataset shifts we consider. We propose a generalizability test that estimates the type of shift in a target dataset without training a model in the target domain, and show that the type of shift is predictive of which data augmentation schemes will be effective for domain adaptation. Overall, we find that these interventions increase end-to-end performance by up to ~24 points.", + "authors": [ + "Dheeru Dua", + "Emma Strubell", + "Sameer Singh", + "Pat Verga" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.807", + "point2d": [ + 5.786736965179443, + -9.4164400100708 + ], + "cluster": 5.0 + }, + { + "idx": 809, + "title": "A Survey for Efficient Open Domain Question Answering", + "abstract": "Open domain question answering (ODQA) is a longstanding task aimed at answering factual questions from a large knowledge corpus without any explicit evidence in natural language processing (NLP). Recent works have predominantly focused on improving the answering accuracy and have achieved promising progress. However, higher accuracy often requires more memory consumption and inference latency, which might not necessarily be efficient enough for direct deployment in the real world. Thus, a trade-off between accuracy, memory consumption and processing speed is pursued. In this paper, we will survey recent advancements in the efficiency of ODQA models and summarize core techniques for achieving efficiency. Additionally, we will provide a quantitative analysis of memory cost, query speed, accuracy, and overall performance comparison. 

Our goal is to keep scholars informed of the latest advancements and open challenges in ODQA efficiency research and contribute to the further development of ODQA efficiency.", + "authors": [ + "Qin Zhang", + "Shangsi Chen", + "Dongkuan Xu", + "Qingqing Cao", + "Xiaojun Chen", + "Trevor Cohn", + "Meng Fang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.808", + "point2d": [ + 69.64790344238281, + 3.6232800483703613 + ], + "cluster": 5.0 + }, + { + "idx": 810, + "title": "Script Normalization for Unconventional Writing of Under-Resourced Languages in Bilingual Communities", + "abstract": "The wide accessibility of social media has provided linguistically under-represented communities with an extraordinary opportunity to create content in their native languages. This, however, comes with certain challenges in script normalization, particularly where the speakers of a language in a bilingual community rely on another script or orthography to write their native language. This paper addresses the problem of script normalization for several such languages that are mainly written in a Perso-Arabic script. Using synthetic data with various levels of noise and a transformer-based model, we demonstrate that the problem can be effectively remediated. We conduct a small-scale evaluation of real data as well. Our experiments indicate that script normalization is also beneficial to improve the performance of downstream tasks such as machine translation and language identification.", + "authors": [ + "Sina Ahmadi", + "Antonios Anastasopoulos" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.809", + "point2d": [ + -52.699790954589844, + -4.700753211975098 + ], + "cluster": 1.0 + }, + { + "idx": 811, + "title": "Compositional Generalization without Trees using Multiset Tagging and Latent Permutations", + "abstract": "Seq2seq models have been shown to struggle with compositional generalization in semantic parsing, i.e. generalizing to unseen compositions of phenomena that the model handles correctly in isolation. We phrase semantic parsing as a two-step process: we first tag each input token with a multiset of output tokens. Then we arrange the tokens into an output sequence using a new way of parameterizing and predicting permutations. We formulate predicting a permutation as solving a regularized linear program and we backpropagate through the solver. In contrast to prior work, our approach does not place a priori restrictions on possible permutations, making it very expressive. Our model outperforms pretrained seq2seq models and prior work on realistic semantic parsing tasks that require generalization to longer examples. We also outperform non-tree-based models on structural generalization on the COGS benchmark. For the first time, we show that a model without an inductive bias provided by trees achieves high accuracy on generalization to deeper recursion depth.", + "authors": [ + "Matthias Lindemann", + "Alexander Koller", + "Ivan Titov" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.810", + "point2d": [ + -25.1146297454834, + -56.51647186279297 + ], + "cluster": 41.0 + }, + { + "idx": 812, + "title": "ManagerTower: Aggregating the Insights of Uni-Modal Experts for Vision-Language Representation Learning", + "abstract": "Two-Tower Vision-Language (VL) models have shown promising improvements on various downstream VL tasks. 

Although the most advanced work improves performance by building bridges between encoders, it suffers from ineffective layer-by-layer utilization of uni-modal representations and cannot flexibly exploit different levels of uni-modal semantic knowledge. In this work, we propose ManagerTower, a novel VL model architecture that gathers and combines the insights of pre-trained uni-modal experts at different levels. The managers introduced in each cross-modal layer can adaptively aggregate uni-modal semantic knowledge to facilitate more comprehensive cross-modal alignment and fusion. ManagerTower outperforms previous strong baselines both with and without Vision-Language Pre-training (VLP). With only 4M VLP data, ManagerTower achieves superior performances on various downstream VL tasks, especially 79.15% accuracy on VQAv2 Test-Std, 86.56% IR@1 and 95.64% TR@1 on Flickr30K. Code and checkpoints are available at https://github.com/LooperXX/ManagerTower.", + "authors": [ + "Xiao Xu", + "Bei Li", + "Chenfei Wu", + "Shao-Yen Tseng", + "Anahita Bhiwandiwalla", + "Shachar Rosenman", + "Vasudev Lal", + "Wanxiang Che", + "Nan Duan" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.811", + "point2d": [ + -56.64636993408203, + 33.640838623046875 + ], + "cluster": 26.0 + }, + { + "idx": 813, + "title": "Finding the Pillars of Strength for Multi-Head Attention", + "abstract": "Recent studies have revealed some issues of Multi-Head Attention (MHA), e.g., redundancy and over-parameterization. Specifically, the heads of MHA were originally designed to attend to information from different representation subspaces, whereas prior studies found that some attention heads likely learn similar features and can be pruned without harming performance. Inspired by the minimum-redundancy feature selection, we assume that focusing on the most representative and distinctive features with minimum resources can mitigate the above issues and lead to more effective and efficient MHAs. In particular, we propose Grouped Head Attention, trained with a self-supervised group constraint that group attention heads, where each group focuses on an essential but distinctive feature subset. We additionally propose a Voting-to-Stay procedure to remove redundant heads, thus achieving a transformer with lighter weights. Extensive experiments are consistent with our hypothesis. Moreover, our method achieves significant performance gains on three well-established tasks while considerably compressing parameters.", + "authors": [ + "Jinjie Ni", + "Rui Mao", + "Zonglin Yang", + "Han Lei", + "Erik Cambria" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.812", + "point2d": [ + -56.5402717590332, + 26.33613395690918 + ], + "cluster": 16.0 + }, + { + "idx": 814, + "title": "Jointprop: Joint Semi-supervised Learning for Entity and Relation Extraction with Heterogeneous Graph-based Propagation", + "abstract": "Semi-supervised learning has been an important approach to address challenges in extracting entities and relations from limited data. However, current semi-supervised works handle the two tasks (i.e., Named Entity Recognition and Relation Extraction) separately and ignore the cross-correlation of entity and relation instances as well as the existence of similar instances across unlabeled data. 
To alleviate the issues, we propose Jointprop, a Heterogeneous Graph-based Propagation framework for joint semi-supervised entity and relation extraction, which captures the global structure information between individual tasks and exploits interactions within unlabeled data. Specifically, we construct a unified span-based heterogeneous graph from entity and relation candidates and propagate class labels based on confidence scores. We then employ a propagation learning scheme to leverage the affinities between labelled and unlabeled samples. Experiments on benchmark datasets show that our framework outperforms the state-of-the-art semi-supervised approaches on NER and RE tasks. We show that the joint semi-supervised learning of the two tasks benefits from their codependency and validates the importance of utilizing the shared information between unlabeled data.", + "authors": [ + "Yandan Zheng", + "Anran Hao", + "Anh Tuan Luu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.813", + "point2d": [ + 40.71971893310547, + -68.01055908203125 + ], + "cluster": 38.0 + }, + { + "idx": 815, + "title": "Reasoning over Hierarchical Question Decomposition Tree for Explainable Question Answering", + "abstract": "Explainable question answering (XQA) aims to answer a given question and provide an explanation why the answer is selected. Existing XQA methods focus on reasoning on a single knowledge source, e.g., structured knowledge bases, unstructured corpora, etc. However, integrating information from heterogeneous knowledge sources is essential to answer complex questions. In this paper, we propose to leverage question decomposing for heterogeneous knowledge integration, by breaking down a complex question into simpler ones, and selecting the appropriate knowledge source for each sub-question. To facilitate reasoning, we propose a novel two-stage XQA framework, Reasoning over Hierarchical Question Decomposition Tree (RoHT). First, we build the Hierarchical Question Decomposition Tree (HQDT) to understand the semantics of a complex question; then, we conduct probabilistic reasoning over HQDT from root to leaves recursively, to aggregate heterogeneous knowledge at different tree levels and search for a best solution considering the decomposing and answering probabilities. The experiments on complex QA datasets KQA Pro and Musique show that our framework outperforms SOTA methods significantly, demonstrating the effectiveness of leveraging question decomposing for knowledge integration and our RoHT framework.", + "authors": [ + "Jiajie Zhang", + "Shulin Cao", + "Tingjian Zhang", + "Xin Lv", + "Juanzi Li", + "Lei Hou", + "Jiaxin Shi", + "Qi Tian" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.814", + "point2d": [ + 70.72081756591797, + 1.4966119527816772 + ], + "cluster": 5.0 + }, + { + "idx": 816, + "title": "Faking Fake News for Real Fake News Detection: Propaganda-Loaded Training Data Generation", + "abstract": "Despite recent advances in detecting fake news generated by neural models, their results are not readily applicable to effective detection of human-written disinformation. What limits the successful transfer between them is the sizable gap between machine-generated fake news and human-authored ones, including the notable differences in terms of style and underlying intent. 

With this in mind, we propose a novel framework for generating training examples that are informed by the known styles and strategies of human-authored propaganda. Specifically, we perform self-critical sequence training guided by natural language inference to ensure the validity of the generated articles, while also incorporating propaganda techniques, such as appeal to authority and loaded language. In particular, we create a new training dataset, PropaNews, with 2,256 examples, which we release for future use. Our experimental results show that fake news detectors trained on PropaNews are better at detecting human-written disinformation by 3.62\u20137.69% F1 score on two public datasets.", + "authors": [ + "Kung-Hsiang Huang", + "Kathleen McKeown", + "Preslav Nakov", + "Yejin Choi", + "Heng Ji" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.815", + "point2d": [ + 34.34825134277344, + 19.262189865112305 + ], + "cluster": 15.0 + }, + { + "idx": 817, + "title": "A Length-Extrapolatable Transformer", + "abstract": "Position modeling plays a critical role in Transformers. In this paper, we focus on length extrapolation, i.e., training on short texts while evaluating longer sequences. We define attention resolution as an indicator of extrapolation. Then we propose two designs to improve the above metric of Transformers. Specifically, we introduce a relative position embedding to explicitly maximize attention resolution. Moreover, we use blockwise causal attention during inference for better resolution. We evaluate different Transformer variants with language modeling. Experimental results show that our model achieves strong performance in both interpolation and extrapolation settings. The code will be available at https://aka.ms/LeX-Transformer.", + "authors": [ + "Yutao Sun", + "Li Dong", + "Barun Patra", + "Shuming Ma", + "Shaohan Huang", + "Alon Benhaim", + "Vishrav Chaudhary", + "Xia Song", + "Furu Wei" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.816", + "point2d": [ + -38.282188415527344, + -29.066394805908203 + ], + "cluster": 27.0 + }, + { + "idx": 818, + "title": "A Survey of Deep Learning for Mathematical Reasoning", + "abstract": "Mathematical reasoning is a fundamental aspect of human intelligence and is applicable in various fields, including science, engineering, finance, and everyday life. The development of artificial intelligence (AI) systems capable of solving math problems and proving theorems in language has garnered significant interest in the fields of machine learning and natural language processing. For example, mathematics serves as a testbed for aspects of reasoning that are challenging for powerful deep learning models, driving new algorithmic and modeling advances. On the other hand, recent advances in large-scale neural language models have opened up new benchmarks and opportunities to use deep learning for mathematical reasoning. In this survey paper, we review the key tasks, datasets, and methods at the intersection of mathematical reasoning and deep learning over the past decade. 
We also evaluate existing benchmarks and methods, and discuss future research directions in this domain.", + "authors": [ + "Pan Lu", + "Liang Qiu", + "Wenhao Yu", + "Sean Welleck", + "Kai-Wei Chang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.817", + "point2d": [ + 45.782745361328125, + -22.627197265625 + ], + "cluster": 12.0 + }, + { + "idx": 819, + "title": "A Systematic Study of Knowledge Distillation for Natural Language Generation with Pseudo-Target Training", + "abstract": "Modern Natural Language Generation (NLG) models come with massive computational and storage requirements. In this work, we study the potential of compressing them, which is crucial for real-world applications serving millions of users. We focus on Knowledge Distillation (KD) techniques, in which a small student model learns to imitate a large teacher model, allowing knowledge to be transferred from the teacher to the student. In contrast to much of the previous work, our goal is to optimize the model for a specific NLG task and a specific dataset. Typically in real-world applications, in addition to labeled data there is abundant unlabeled task-specific data, which is crucial for attaining high compression rates via KD. In this work, we conduct a systematic study of task-specific KD techniques for various NLG tasks under realistic assumptions. We discuss the special characteristics of NLG distillation and particularly the exposure bias problem. Following, we derive a family of Pseudo-Target (PT) augmentation methods, substantially extending prior work on sequence-level KD. We propose the Joint-Teaching method, which applies word-level KD to multiple PTs generated by both the teacher and the student. Finally, we validate our findings in an extreme setup with no labeled examples using GPT-4 as the teacher. Our study provides practical model design observations and demonstrates the effectiveness of PT training for task-specific KD in NLG.", + "authors": [ + "Nitay Calderon", + "Subhabrata Mukherjee", + "Roi Reichart", + "Amir Kantor" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.818", + "point2d": [ + -50.759620666503906, + -20.8984317779541 + ], + "cluster": 39.0 + }, + { + "idx": 820, + "title": "Vision Language Pre-training by Contrastive Learning with Cross-Modal Similarity Regulation", + "abstract": "In this paper, we reconsider the problem of (partial) false negative samples from the Mutual Information (MI) Maximization perspective. The traditional contrastive loss (such as the InfoNCE loss) equally pushes all negative samples away from the anchor, regardless of their possible semantic similarities. We theoretically show that the InfoNCE loss not only maximizes the MI between the anchor and positive samples but also minimizes the MI between the anchor and false negative samples, even though they share similar semantics. This offers a possible theoretical explanation for the observation that false negative samples in cross-modal contrastive learning decrease the downstream task performance of VLP models. 

The above analysis motivates us to propose a VLP model with a novel Semantic-Aware Contrastive Learning framework, named SACL, where different negative samples are assigned different contrastive weights according to their semantic similarity to the anchor.", + "authors": [ + "Chaoya Jiang", + "Wei Ye", + "Haiyang Xu", + "Songfang Huang", + "Fei Huang", + "Shikun Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.819", + "point2d": [ + -55.30802536010742, + 32.09221649169922 + ], + "cluster": 16.0 + }, + { + "idx": 821, + "title": "Tell2Design: A Dataset for Language-Guided Floor Plan Generation", + "abstract": "We consider the task of generating designs directly from natural language descriptions, and consider floor plan generation as the initial research area. Language conditional generative models have recently been very successful in generating high-quality artistic images. However, designs must satisfy different constraints that are not present in generating artistic images, particularly spatial and relational constraints. We make multiple contributions to initiate research on this task. First, we introduce a novel dataset, Tell2Design (T2D), which contains more than 80k floor plan designs associated with natural language instructions. Second, we propose a Sequence-to-Sequence model that can serve as a strong baseline for future research. Third, we benchmark this task with several text-conditional image generation models. We conclude by conducting human evaluations on the generated samples and providing an analysis of human performance. We hope our contributions will propel the research on language-guided design generation forward.", + "authors": [ + "Sicong Leng", + "Yang Zhou", + "Mohammed Haroon Dupty", + "Wee Sun Lee", + "Sam Joyce", + "Wei Lu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.820", + "point2d": [ + -61.972412109375, + 50.91871643066406 + ], + "cluster": 43.0 + }, + { + "idx": 822, + "title": "Are Human Explanations Always Helpful? Towards Objective Evaluation of Human Natural Language Explanations", + "abstract": "Human-annotated labels and explanations are critical for training explainable NLP models. However, unlike human-annotated labels whose quality is easier to calibrate (e.g., with a majority vote), human-crafted free-form explanations can be quite subjective. Before blindly using them as ground truth to train ML models, a vital question needs to be asked: How do we evaluate a human-annotated explanation\u2019s quality? In this paper, we build on the view that the quality of a human-annotated explanation can be measured based on its helpfulness (or impairment) to the ML models\u2019 performance for the desired NLP tasks for which the annotations were collected. In comparison to the commonly used Simulatability score, we define a new metric that can take into consideration the helpfulness of an explanation for model performance at both fine-tuning and inference. 

With the help of a unified dataset format, we evaluated the proposed metric on five datasets (e.g., e-SNLI) against two model architectures (T5 and BART), and the results show that our proposed metric can objectively evaluate the quality of human-annotated explanations, while Simulatability falls short.", + "authors": [ + "Bingsheng Yao", + "Prithviraj Sen", + "Lucian Popa", + "James Hendler", + "Dakuo Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.821", + "point2d": [ + 29.690418243408203, + -9.715987205505371 + ], + "cluster": 36.0 + }, + { + "idx": 823, + "title": "Rethinking Annotation: Can Language Learners Contribute?", + "abstract": "Researchers have traditionally recruited native speakers to provide annotations for the widely used benchmark datasets. But there are languages for which recruiting native speakers is difficult, and it would help to get learners of those languages to annotate the data. In this paper, we investigate whether language learners can contribute annotations to the benchmark datasets. In a carefully controlled annotation experiment, we recruit 36 language learners, provide two types of additional resources (dictionaries and machine-translated sentences), and perform mini-tests to measure their language proficiency. We target three languages, English, Korean, and Indonesian, and four NLP tasks, sentiment analysis, natural language inference, named entity recognition, and machine reading comprehension. We find that language learners, especially those with intermediate or advanced language proficiency, are able to provide fairly accurate labels with the help of additional resources. Moreover, we show that data annotation improves learners\u2019 language proficiency in terms of vocabulary and grammar. The implication of our findings is that broadening the annotation task to include language learners can open up the opportunity to build benchmark datasets for languages for which it is difficult to recruit native speakers.", + "authors": [ + "Haneul Yoo", + "Rifki Afina Putri", + "Changyoon Lee", + "Youngin Lee", + "So-Yeon Ahn", + "Dongyeop Kang", + "Alice Oh" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.822", + "point2d": [ + -24.46588134765625, + -33.928043365478516 + ], + "cluster": 46.0 + }, + { + "idx": 824, + "title": "Information Screening whilst Exploiting! Multimodal Relation Extraction with Feature Denoising and Multimodal Topic Modeling", + "abstract": "Existing research on multimodal relation extraction (MRE) faces two co-existing challenges, internal-information over-utilization and external-information under-exploitation. To combat that, we propose a novel framework that simultaneously implements the idea of internal-information screening and external-information exploiting. First, we represent the fine-grained semantic structures of the input image and text with the visual and textual scene graphs, which are further fused into a unified cross-modal graph (CMG). Based on CMG, we perform structure refinement with the guidance of the graph information bottleneck principle, actively denoising the less-informative features. Next, we perform topic modeling over the input image and text, incorporating latent multimodal topic features to enrich the contexts. On the benchmark MRE dataset, our system outperforms the current best model significantly. 
With further in-depth analyses, we reveal the great potential of our method for the MRE task.", + "authors": [ + "Shengqiong Wu", + "Hao Fei", + "Yixin Cao", + "Lidong Bing", + "Tat-Seng Chua" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.823", + "point2d": [ + -48.581642150878906, + 41.669219970703125 + ], + "cluster": 13.0 + }, + { + "idx": 825, + "title": "MultiEMO: An Attention-Based Correlation-Aware Multimodal Fusion Framework for Emotion Recognition in Conversations", + "abstract": "Emotion Recognition in Conversations (ERC) is an increasingly popular task in the Natural Language Processing community, which seeks to achieve accurate emotion classifications of utterances expressed by speakers during a conversation. Most existing approaches focus on modeling speaker and contextual information based on the textual modality; the complementarity of multimodal information has not been well leveraged, and few current methods have sufficiently captured the complex correlations and mapping relationships across different modalities. Furthermore, existing state-of-the-art ERC models have difficulty classifying minority and semantically similar emotion categories. To address these challenges, we propose a novel attention-based correlation-aware multimodal fusion framework named MultiEMO, which effectively integrates multimodal cues by capturing cross-modal mapping relationships across textual, audio and visual modalities based on bidirectional multi-head cross-attention layers. The difficulty of recognizing minority and semantically hard-to-distinguish emotion classes is alleviated by our proposed Sample-Weighted Focal Contrastive (SWFC) loss. Extensive experiments on two benchmark ERC datasets demonstrate that our MultiEMO framework consistently outperforms existing state-of-the-art approaches in all emotion categories on both datasets; the improvements in minority and semantically similar emotions are especially significant.", + "authors": [ + "Tao Shi", + "Shao-Lun Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.824", + "point2d": [ + -37.56694030761719, + 63.52583694458008 + ], + "cluster": 23.0 + }, + { + "idx": 826, + "title": "Learning Language-Specific Layers for Multilingual Machine Translation", + "abstract": "Multilingual Machine Translation promises to improve translation quality between non-English languages. This is advantageous for several reasons, namely lower latency (no need to translate twice), and reduced error cascades (e.g., avoiding losing gender and formality information when translating through English). On the downside, adding more languages reduces model capacity per language, which is usually countered by increasing the overall model size, making training harder and inference slower. In this work, we introduce Language-Specific Transformer Layers (LSLs), which allow us to increase model capacity, while keeping the amount of computation and the number of parameters used in the forward pass constant. The key idea is to have some layers of the encoder be source or target language-specific, while keeping the remaining layers shared. 

We study the best way to place these layers using a neural architecture search inspired approach, and achieve an improvement of 1.3 chrF (1.5 spBLEU) points over not using LSLs on a separate decoder architecture, and 1.9 chrF (2.2 spBLEU) on a shared decoder one.", + "authors": [ + "Telmo Pires", + "Robin Schmidt", + "Yi-Hsiu Liao", + "Stephan Peitz" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.825", + "point2d": [ + -59.37373352050781, + -10.082304954528809 + ], + "cluster": 21.0 + }, + { + "idx": 827, + "title": "Personality Understanding of Fictional Characters during Book Reading", + "abstract": "Comprehending characters\u2019 personalities is a crucial aspect of story reading. As readers engage with a story, their understanding of a character evolves based on new events and information; and multiple fine-grained aspects of personalities can be perceived. This leads to a natural problem of situated and fine-grained personality understanding. The problem has not been studied in the NLP field, primarily due to the lack of appropriate datasets mimicking the process of book reading. We present the first labeled dataset PersoNet for this problem. Our novel annotation strategy involves annotating user notes from online reading apps as a proxy for the original books. Experiments and human studies indicate that our dataset construction is both efficient and accurate; and our task heavily relies on long-term context to achieve accurate predictions for both machines and humans.", + "authors": [ + "Mo Yu", + "Jiangnan Li", + "Shunyu Yao", + "Wenjie Pang", + "Xiaochen Zhou", + "Zhou Xiao", + "Fandong Meng", + "Jie Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.826", + "point2d": [ + 8.325057029724121, + 49.14848709106445 + ], + "cluster": 19.0 + }, + { + "idx": 828, + "title": "StoryTrans: Non-Parallel Story Author-Style Transfer with Discourse Representations and Content Enhancing", + "abstract": "Non-parallel text style transfer is an important task in natural language generation. However, previous studies concentrate on the token or sentence level, such as sentence sentiment and formality transfer, but neglect long style transfer at the discourse level. Long texts usually involve more complicated author linguistic preferences such as discourse structures than sentences. In this paper, we formulate the task of non-parallel story author-style transfer, which requires transferring an input story into a specified author style while maintaining source semantics. To tackle this problem, we propose a generation model, named StoryTrans, which leverages discourse representations to capture source content information and transfer them to target styles with learnable style embeddings. We use an additional training objective to disentangle stylistic features from the learned discourse representation to prevent the model from degenerating to an auto-encoder. Moreover, to enhance content preservation, we design a mask-and-fill framework to explicitly fuse style-specific keywords of source texts into generation. Furthermore, we constructed new datasets for this task in Chinese and English, respectively. 
Extensive experiments show that our model outperforms strong baselines in overall performance of style transfer and content preservation.", + "authors": [ + "Xuekai Zhu", + "Jian Guan", + "Minlie Huang", + "Juan Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.827", + "point2d": [ + -23.049352645874023, + 43.51984786987305 + ], + "cluster": 35.0 + }, + { + "idx": 829, + "title": "Towards Benchmarking and Improving the Temporal Reasoning Capability of Large Language Models", + "abstract": "Reasoning about time is of fundamental importance. Many facts are time-dependent. For example, athletes change teams from time to time, and different government officials are elected periodically. Previous time-dependent question answering (QA) datasets tend to be biased in either their coverage of time spans or question types. In this paper, we introduce a comprehensive probing dataset TempReason to evaluate the temporal reasoning capability of large language models. Our dataset includes questions of three temporal reasoning levels. In addition, we also propose a novel learning framework to improve the temporal reasoning capability of large language models, based on temporal span extraction and time-sensitive reinforcement learning. We conducted experiments in closed book QA, open book QA, and reasoning QA settings and demonstrated the effectiveness of our approach.", + "authors": [ + "Qingyu Tan", + "Hwee Tou Ng", + "Lidong Bing" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.828", + "point2d": [ + 74.34767150878906, + 0.30039408802986145 + ], + "cluster": 5.0 + }, + { + "idx": 830, + "title": "Finding the SWEET Spot: Analysis and Improvement of Adaptive Inference in Low Resource Settings", + "abstract": "Adaptive inference is a simple method for reducing inference costs. The method works by maintaining multiple classifiers of different capacities, and allocating resources to each test instance according to its difficulty. In this work, we compare the two main approaches for adaptive inference, Early-Exit and Multi-Model, when training data is limited. First, we observe that for models with the same architecture and size, individual Multi-Model classifiers outperform their Early-Exit counterparts by an average of 2.3%. We show that this gap is caused by Early-Exit classifiers sharing model parameters during training, resulting in conflicting gradient updates of model weights. We find that despite this gap, Early-Exit still provides a better speed-accuracy trade-off due to the overhead of the Multi-Model approach. To address these issues, we propose SWEET (Separating Weights for Early-Exit Transformers), an Early-Exit fine-tuning method that assigns each classifier its own set of unique model weights, not updated by other classifiers. We compare SWEET\u2019s speed-accuracy curve to standard Early-Exit and Multi-Model baselines and find that it outperforms both methods at fast speeds while maintaining comparable scores to Early-Exit at slow speeds. Moreover, SWEET individual classifiers outperform Early-Exit ones by 1.1% on average. 

SWEET enjoys the benefits of both methods, paving the way for further reduction of inference costs in NLP.", + "authors": [ + "Daniel Rotem", + "Michael Hassid", + "Jonathan Mamou", + "Roy Schwartz" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.829", + "point2d": [ + -41.574100494384766, + -19.59918212890625 + ], + "cluster": 44.0 + }, + { + "idx": 831, + "title": "Large Language Models Are Reasoning Teachers", + "abstract": "Recent works have shown that chain-of-thought (CoT) prompting can elicit language models to solve complex reasoning tasks, step-by-step. However, prompt-based CoT methods are dependent on very large models such as GPT-3 175B which are prohibitive to deploy at scale. In this paper, we use these large models as reasoning teachers to enable complex reasoning in smaller models and reduce model size requirements by several orders of magnitude. We propose Fine-tune-CoT, a method that generates reasoning samples from very large teacher models to fine-tune smaller models. We evaluate our method on a wide range of public models and complex tasks. We find that Fine-tune-CoT enables substantial reasoning capability in small models, far outperforming prompt-based baselines and even the teacher model in many tasks. Additionally, we extend our method by leveraging the teacher model\u2019s ability to generate multiple distinct rationales for each original sample. Enriching the fine-tuning data with such diverse reasoning results in a substantial performance boost across datasets, even for very small models. We conduct ablations and sample studies to understand the emergence of reasoning capabilities of student models. Our code implementation and data are available at https://github.com/itsnamgyu/reasoning-teacher.", + "authors": [ + "Namgyu Ho", + "Laura Schmid", + "Se-Young Yun" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.830", + "point2d": [ + 44.218170166015625, + -15.710226058959961 + ], + "cluster": 36.0 + }, + { + "idx": 832, + "title": "Abductive Commonsense Reasoning Exploiting Mutually Exclusive Explanations", + "abstract": "Abductive reasoning aims to find plausible explanations for an event. This style of reasoning is critical for commonsense tasks where there are often multiple plausible explanations. Existing approaches for abductive reasoning in natural language processing (NLP) often rely on manually generated annotations for supervision; however, such annotations can be subjective and biased. Instead of using direct supervision, this work proposes an approach for abductive commonsense reasoning that exploits the fact that only a subset of explanations is correct for a given context. The method uses posterior regularization to enforce a mutual exclusion constraint, encouraging the model to learn the distinction between fluent explanations and plausible ones. 
We evaluate our approach on a diverse set of abductive reasoning datasets; experimental results show that our approach outperforms or is comparable to directly applying pretrained language models in a zero-shot manner and other knowledge-augmented zero-shot methods.", + "authors": [ + "Wenting Zhao", + "Justin Chiu", + "Claire Cardie", + "Alexander Rush" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.831", + "point2d": [ + 56.1689338684082, + -17.54045867919922 + ], + "cluster": 31.0 + }, + { + "idx": 833, + "title": "PESCO: Prompt-enhanced Self Contrastive Learning for Zero-shot Text Classification", + "abstract": "We present PESCO, a novel contrastive learning framework that substantially improves the performance of zero-shot text classification. We formulate text classification as a neural text retrieval problem where each document is treated as a query, and the system learns the mapping from each query to the relevant class labels by (1) adding prompts to enhance label retrieval, and (2) using retrieved labels to enrich the training set in a self-training loop of contrastive learning. PESCO achieves state-of-the-art performance on four benchmark text classification datasets. On DBpedia, we achieve 98.5% accuracy without any labeled data, which is close to the fully-supervised result. Extensive experiments and analyses show all the components of PESCO are necessary for improving the performance of zero-shot text classification.", + "authors": [ + "Yau-Shian Wang", + "Ta-Chung Chi", + "Ruohong Zhang", + "Yiming Yang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.832", + "point2d": [ + 2.5937299728393555, + -26.7945613861084 + ], + "cluster": 17.0 + }, + { + "idx": 834, + "title": "Visually-augmented pretrained language models for NLP tasks without images", + "abstract": "Although pre-trained language models (PLMs) have shown impressive performance by text-only self-supervised training, they are found to lack visual semantics or commonsense. Existing solutions often rely on explicit images for visual knowledge augmentation (requiring time-consuming retrieval or generation), and they also conduct the augmentation for the whole input text, without considering whether it is actually needed in specific inputs or tasks. To address these issues, we propose a novel **V**isually-**A**ugmented fine-tuning approach that can be generally applied to various PLMs or NLP tasks, **W**ithout using any retrieved or generated **I**mages, namely **VAWI**. Experimental results show that our approach can consistently improve the performance of BERT, RoBERTa, BART, and T5 at different scales, and outperform several competitive baselines on ten tasks. Our codes and data are publicly available at https://github.com/RUCAIBox/VAWI.", + "authors": [ + "Hangyu Guo", + "Kun Zhou", + "Wayne Xin Zhao", + "Qinyu Zhang", + "Ji-Rong Wen" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.833", + "point2d": [ + -54.197532653808594, + 36.893211364746094 + ], + "cluster": 26.0 + }, + { + "idx": 835, + "title": "Using counterfactual contrast to improve compositional generalization for multi-step quantitative reasoning", + "abstract": "In quantitative question answering, compositional generalization is one of the main challenges of state-of-the-art models, especially when longer sequences of reasoning steps are required. 

In this paper, we propose CounterComp, a method that uses counterfactual scenarios to generate samples with compositional contrast. Instead of a data augmentation approach, CounterComp is based on metric learning, which allows for direct sampling from the training set and circumvents the need for additional human labels. Our proposed auxiliary metric learning loss improves the performance of three state-of-the-art models on four recently released datasets. We also show how the approach can improve OOD performance on unseen domains, as well as unseen compositions. Lastly, we demonstrate how the method can lead to better compositional attention patterns during training.", + "authors": [ + "Armineh Nourbakhsh", + "Sameena Shah", + "Carolyn Ros\u00e9" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.834", + "point2d": [ + 62.142486572265625, + 2.7257721424102783 + ], + "cluster": 31.0 + }, + { + "idx": 836, + "title": "A Needle in a Haystack: An Analysis of High-Agreement Workers on MTurk for Summarization", + "abstract": "To prevent the costly and inefficient use of resources on low-quality annotations, we want a method for creating a pool of dependable annotators who can effectively complete difficult tasks, such as evaluating automatic summarization. Thus, we investigate the recruitment of high-quality Amazon Mechanical Turk workers via a two-step pipeline. We show that we can successfully filter out subpar workers before they carry out the evaluations and obtain high-agreement annotations with similar constraints on resources. Although our workers demonstrate a strong consensus among themselves and CloudResearch workers, their alignment with expert judgments on a subset of the data is not as expected and needs further training in correctness. This paper still serves as a best practice for the recruitment of qualified annotators in other challenging annotation tasks.", + "authors": [ + "Lining Zhang", + "Simon Mille", + "Yufang Hou", + "Daniel Deutsch", + "Elizabeth Clark", + "Yixin Liu", + "Saad Mahamood", + "Sebastian Gehrmann", + "Miruna Clinciu", + "Khyathi Raghavi Chandu", + "Jo\u00e3o Sedoc" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.835", + "point2d": [ + -9.261098861694336, + 37.51304244995117 + ], + "cluster": 47.0 + }, + { + "idx": 837, + "title": "TAVT: Towards Transferable Audio-Visual Text Generation", + "abstract": "Audio-visual text generation aims to understand multi-modality contents and translate them into texts. Although various transfer learning techniques of text generation have been proposed, they focus on uni-modal analysis (e.g. text-to-text, visual-to-text) and lack consideration of multi-modal content and cross-modal relation. Motivated by the fact that humans can recognize the timbre of the same low-level concepts (e.g., footstep, rainfall, and laughing), even in different visual conditions, we aim to mitigate the domain discrepancies by audio-visual correlation. In this paper, we propose a novel Transferable Audio-Visual Text Generation framework, named TAVT, which consists of two key components: Audio-Visual Meta-Mapper (AVMM) and Dual Counterfactual Contrastive Learning (DCCL). (1) AVMM first introduces a universal auditory semantic space and drifts the domain-invariant low-level concepts into visual prefixes. 

Then, reconstruction-based learning encourages the AVMM to learn \u201cwhich pixels belong to the same sound\u201d and to achieve audio-enhanced visual prefixes. The well-trained AVMM can be further applied to the uni-modal setting. (2) Furthermore, DCCL leverages the destructive counterfactual transformations to provide cross-modal constraints for AVMM from the perspective of feature distribution and text generation. (3) The experimental results show that TAVT outperforms the state-of-the-art methods across multiple domains (cross-datasets, cross-categories) and various modal settings (uni-modal, multi-modal).", + "authors": [ + "Wang Lin", + "Tao Jin", + "Wenwen Pan", + "Linjun Li", + "Xize Cheng", + "Ye Wang", + "Zhou Zhao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.836", + "point2d": [ + -65.39702606201172, + 49.24152755737305 + ], + "cluster": 43.0 + }, + { + "idx": 838, + "title": "MeetingQA: Extractive Question-Answering on Meeting Transcripts", + "abstract": "With the ubiquitous use of online meeting platforms and robust automatic speech recognition systems, meeting transcripts have emerged as a promising domain for natural language tasks. Most recent works on meeting transcripts primarily focus on summarization and extraction of action items. However, meeting discussions also have a useful question-answering (QA) component, crucial to understanding the discourse or meeting content, and can be used to build interactive interfaces on top of long transcripts. Hence, in this work, we leverage this inherent QA component of meeting discussions and introduce MeetingQA, an extractive QA dataset comprising questions asked by meeting participants and corresponding responses. As a result, questions can be open-ended and actively seek discussions, while the answers can be multi-span and distributed across multiple speakers. Our comprehensive empirical study of several robust baselines including long-context language models and recent instruction-tuned models reveals that models perform poorly on this task (F1 = 57.3) and severely lag behind human performance (F1 = 84.6), thus presenting a challenging new task for the community to improve upon.", + "authors": [ + "Archiki Prasad", + "Trung Bui", + "Seunghyun Yoon", + "Hanieh Deilamsalehy", + "Franck Dernoncourt", + "Mohit Bansal" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.837", + "point2d": [ + 15.805720329284668, + 51.61216354370117 + ], + "cluster": 24.0 + }, + { + "idx": 839, + "title": "FERMAT: An Alternative to Accuracy for Numerical Reasoning", + "abstract": "While pre-trained language models achieve impressive performance on various NLP benchmarks, they still struggle with tasks that require numerical reasoning. Recent advances in improving numerical reasoning are mostly achieved using very large language models that contain billions of parameters and are not accessible to everyone. In addition, numerical reasoning is measured using a single score on existing datasets. As a result, we do not have a clear understanding of the strengths and shortcomings of existing models on different numerical reasoning aspects and therefore, potential ways to improve them apart from scaling them up. Inspired by CheckList (Ribeiro et al., 2020), we introduce a multi-view evaluation set for numerical reasoning in English, called FERMAT. 

Instead of reporting a single score on a whole dataset, FERMAT evaluates models on various key numerical reasoning aspects such as number understanding, mathematical operations, and training dependency. Apart from providing a comprehensive evaluation of models on different numerical reasoning aspects, FERMAT enables a systematic and automated generation of an arbitrarily large training or evaluation set for each aspect. The datasets and codes are publicly available to generate further multi-view data for additional tasks and languages.", + "authors": [ + "Jasivan Sivakumar", + "Nafise Sadat Moosavi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.838", + "point2d": [ + 43.12709045410156, + -20.88669204711914 + ], + "cluster": 12.0 + }, + { + "idx": 840, + "title": "Don\u2019t Forget Your ABC\u2019s: Evaluating the State-of-the-Art in Chat-Oriented Dialogue Systems", + "abstract": "Despite tremendous advancements in dialogue systems, stable evaluation still requires human judgments, producing notoriously high-variance metrics due to their inherent subjectivity. Moreover, methods and labels in dialogue evaluation are not fully standardized, especially for open-domain chats, with a lack of work to compare and assess the validity of those approaches. The use of inconsistent evaluation can misinform the performance of a dialogue system, which becomes a major hurdle to enhance it. Thus, a dimensional evaluation of chat-oriented open-domain dialogue systems that reliably measures several aspects of dialogue capabilities is desired. This paper presents a novel human evaluation method to estimate the rates of many dialogue system behaviors. Our method is used to evaluate four state-of-the-art open-domain dialogue systems and is compared with existing approaches. The analysis demonstrates that our behavior method is more suitable than alternative Likert-style or comparative approaches for dimensional evaluation of these systems.", + "authors": [ + "Sarah E. Finch", + "James D. Finch", + "Jinho D. Choi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.839", + "point2d": [ + 15.879292488098145, + 69.00788116455078 + ], + "cluster": 24.0 + }, + { + "idx": 841, + "title": "Decoder Tuning: Efficient Language Understanding as Decoding", + "abstract": "With the ever-growing sizes of pre-trained models (PTMs), it has become an emerging practice to only provide the inference APIs for users, namely the model-as-a-service (MaaS) setting. To adapt PTMs with model parameters frozen, most current approaches focus on the input side, seeking powerful prompts to stimulate models for correct answers. However, we argue that input-side adaptation could be arduous due to the lack of gradient signals, and such methods usually require thousands of API queries, resulting in high computation and time costs. In light of this, we present Decoder Tuning (DecT), which instead optimizes task-specific decoder networks on the output side. Specifically, DecT first extracts prompt-stimulated output scores for initial predictions. On top of that, we train an additional decoder network on the output representations to incorporate posterior data knowledge. By gradient-based optimization, DecT can be trained within several seconds and requires only one PTM query per sample. Empirically, we conduct extensive natural language understanding experiments and show that DecT significantly outperforms state-of-the-art algorithms with a 200x speed-up.
Our code is available at https://github.com/thunlp/DecT.", + "authors": [ + "Ganqu Cui", + "Wentao Li", + "Ning Ding", + "Longtao Huang", + "Zhiyuan Liu", + "Maosong Sun" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.840", + "point2d": [ + -22.44401741027832, + -12.113551139831543 + ], + "cluster": 3.0 + }, + { + "idx": 842, + "title": "The KITMUS Test: Evaluating Knowledge Integration from Multiple Sources", + "abstract": "Many state-of-the-art natural language understanding (NLU) models are based on pretrained neural language models. These models often make inferences using information from multiple sources. An important class of such inferences are those that require both background knowledge, presumably contained in a model\u2019s pretrained parameters, and instance-specific information that is supplied at inference time. However, the integration and reasoning abilities of NLU models in the presence of multiple knowledge sources have been largely understudied. In this work, we propose a test suite of coreference resolution subtasks that require reasoning over multiple facts. These subtasks differ in terms of which knowledge sources contain the relevant facts. We also introduce subtasks where knowledge is present only at inference time using fictional knowledge. We evaluate state-of-the-art coreference resolution models on our dataset. Our results indicate that several models struggle to reason on-the-fly over knowledge observed both at pretrain time and at inference time. However, with task-specific training, a subset of models demonstrates the ability to integrate certain knowledge types from multiple sources. Still, even the best performing models seem to have difficulties with reliably integrating knowledge presented only at inference time.", + "authors": [ + "Akshatha Arodi", + "Martin P\u00f6msl", + "Kaheer Suleman", + "Adam Trischler", + "Alexandra Olteanu", + "Jackie Chi Kit Cheung" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.841", + "point2d": [ + 48.675315856933594, + -13.580045700073242 + ], + "cluster": 31.0 + }, + { + "idx": 843, + "title": "CREST: A Joint Framework for Rationalization and Counterfactual Text Generation", + "abstract": "Selective rationales and counterfactual examples have emerged as two effective, complementary classes of interpretability methods for analyzing and training NLP models. However, prior work has not explored how these methods can be integrated to combine their complementary advantages. We overcome this limitation by introducing CREST (ContRastive Edits with Sparse raTionalization), a joint framework for selective rationalization and counterfactual text generation, and show that this framework leads to improvements in counterfactual quality, model robustness, and interpretability. First, CREST generates valid counterfactuals that are more natural than those produced by previous methods, and subsequently can be used for data augmentation at scale, reducing the need for human-generated examples. Second, we introduce a new loss function that leverages CREST counterfactuals to regularize selective rationales and show that this regularization improves both model robustness and rationale quality, compared to methods that do not leverage CREST counterfactuals. 
Our results demonstrate that CREST successfully bridges the gap between selective rationales and counterfactual examples, addressing the limitations of existing methods and providing a more comprehensive view of a model\u2019s predictions.", + "authors": [ + "Marcos Treviso", + "Alexis Ross", + "Nuno M. Guerreiro", + "Andr\u00e9 Martins" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.842", + "point2d": [ + 29.8196964263916, + -4.054169178009033 + ], + "cluster": 31.0 + }, + { + "idx": 844, + "title": "Towards Unifying Multi-Lingual and Cross-Lingual Summarization", + "abstract": "To adapt text summarization to the multilingual world, previous work proposes multi-lingual summarization (MLS) and cross-lingual summarization (CLS). However, these two tasks have been studied separately due to the different definitions, which limits the compatible and systematic research on both of them. In this paper, we aim to unify MLS and CLS into a more general setting, i.e., many-to-many summarization (M2MS), where a single model could process documents in any language and generate their summaries also in any language. As the first step towards M2MS, we conduct preliminary studies to show that M2MS can better transfer task knowledge across different languages than MLS and CLS. Furthermore, we propose Pisces, a pre-trained M2MS model that learns language modeling, cross-lingual ability and summarization ability via three-stage pre-training. Experimental results indicate that our Pisces significantly outperforms the state-of-the-art baselines, especially in the zero-shot directions, where there is no training data from the source-language documents to the target-language summaries.", + "authors": [ + "Jiaan Wang", + "Fandong Meng", + "Duo Zheng", + "Yunlong Liang", + "Zhixu Li", + "Jianfeng Qu", + "Jie Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.843", + "point2d": [ + -12.212258338928223, + 40.37117004394531 + ], + "cluster": 7.0 + }, + { + "idx": 845, + "title": "On Improving Summarization Factual Consistency from Natural Language Feedback", + "abstract": "Despite the recent progress in language generation models, their outputs may not always meet user expectations. In this work, we study whether informational feedback in natural language can be leveraged to improve generation quality and user preference alignment. To this end, we consider factual consistency in summarization, the quality that the summary should only contain information supported by the input documents, as the user-expected preference. We collect a high-quality dataset, DeFacto, containing human demonstrations and informational natural language feedback consisting of corrective instructions, edited summaries, and explanations with respect to the factual consistency of the summary. Using our dataset, we study three natural language generation tasks: (1) editing a summary by following the human feedback, (2) generating human feedback for editing the original summary, and (3) revising the initial summary to correct factual errors by generating both the human feedback and edited summary. We show that DeFacto can provide factually consistent human-edited summaries and further insights into summarization factual consistency thanks to its informational natural language feedback. 
We further demonstrate that fine-tuned language models can leverage our dataset to improve the summary factual consistency, while large language models lack the zero-shot learning ability in our proposed tasks that require controllable text generation.", + "authors": [ + "Yixin Liu", + "Budhaditya Deb", + "Milagro Teruel", + "Aaron Halfaker", + "Dragomir Radev", + "Ahmed Hassan Awadallah" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.844", + "point2d": [ + -3.9246504306793213, + 45.8350944519043 + ], + "cluster": 47.0 + }, + { + "idx": 846, + "title": "From Dogwhistles to Bullhorns: Unveiling Coded Rhetoric with Language Models", + "abstract": "Dogwhistles are coded expressions that simultaneously convey one meaning to a broad audience and a second, often hateful or provocative, meaning to a narrow in-group; they are deployed to evade both political repercussions and algorithmic content moderation. For example, the word \u201ccosmopolitan\u201d in a sentence such as \u201cwe need to end the cosmopolitan experiment\u201d can mean \u201cworldly\u201d to many but also secretly mean \u201cJewish\u201d to a select few. We present the first large-scale computational investigation of dogwhistles. We develop a typology of dogwhistles, curate the largest-to-date glossary of over 300 dogwhistles with rich contextual information and examples, and analyze their usage in historical U.S. politicians\u2019 speeches. We then assess whether a large language model (GPT-3) can identify dogwhistles and their meanings, and find that GPT-3\u2019s performance varies widely across types of dogwhistles and targeted groups. Finally, we show that harmful content containing dogwhistles avoids toxicity detection, highlighting online risks presented by such coded language. This work sheds light on the theoretical and applied importance of dogwhistles in both NLP and computational social science, and provides resources to facilitate future research in modeling dogwhistles and mitigating their online harms.", + "authors": [ + "Julia Mendelsohn", + "Ronan Le Bras", + "Yejin Choi", + "Maarten Sap" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.845", + "point2d": [ + 26.757980346679688, + 30.47288703918457 + ], + "cluster": 10.0 + }, + { + "idx": 847, + "title": "Exploring Large Language Models for Classical Philology", + "abstract": "Recent advances in NLP have led to the creation of powerful language models for many languages including Ancient Greek and Latin. While prior work on Classical languages unanimously uses BERT, in this work we create four language models for Ancient Greek that vary along two dimensions to study their versatility for tasks of interest for Classical languages: we explore (i) encoder-only and encoder-decoder architectures using RoBERTa and T5 as strong model types, and create for each of them (ii) a monolingual Ancient Greek and a multilingual instance that includes Latin and English. We evaluate all models on morphological and syntactic tasks, including lemmatization, which demonstrates the added value of T5\u2019s decoding abilities. We further define two probing tasks to investigate the knowledge acquired by models pre-trained on Classical texts. Our experiments provide the first benchmarking analysis of existing models of Ancient Greek. Results show that our models provide significant improvements over the SoTA. 
The systematic analysis of model types can inform future research in designing language models for Classical languages, including the development of novel generative tasks. We make all our models available as community resources, along with a large curated pre-training corpus for Ancient Greek, to support the creation of a larger, comparable model zoo for Classical Philology.", + "authors": [ + "Frederick Riemenschneider", + "Anette Frank" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.846", + "point2d": [ + -41.53806686401367, + -46.838523864746094 + ], + "cluster": 46.0 + }, + { + "idx": 848, + "title": "LayoutMask: Enhance Text-Layout Interaction in Multi-modal Pre-training for Document Understanding", + "abstract": "Visually-rich Document Understanding (VrDU) has attracted much research attention over the past years. Pre-trained models on a large number of document images with transformer-based backbones have led to significant performance gains in this field. The major challenge is how to fuse the different modalities (text, layout, and image) of the documents in a unified model with different pre-training tasks. This paper focuses on improving text-layout interactions and proposes a novel multi-modal pre-training model, LayoutMask. LayoutMask uses local 1D position, instead of global 1D position, as layout input and has two pre-training objectives: (1) Masked Language Modeling: predicting masked tokens with two novel masking strategies; (2) Masked Position Modeling: predicting masked 2D positions to improve layout representation learning. LayoutMask can enhance the interactions between text and layout modalities in a unified model and produce adaptive and robust multi-modal representations for downstream tasks. Experimental results show that our proposed method can achieve state-of-the-art results on a wide variety of VrDU problems, including form understanding, receipt understanding, and document image classification.", + "authors": [ + "Yi Tu", + "Ya Guo", + "Huan Chen", + "Jinyang Tang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.847", + "point2d": [ + -49.780517578125, + 45.79033279418945 + ], + "cluster": 43.0 + }, + { + "idx": 849, + "title": "Hearing Lips in Noise: Universal Viseme-Phoneme Mapping and Transfer for Robust Audio-Visual Speech Recognition", + "abstract": "Audio-visual speech recognition (AVSR) provides a promising solution to ameliorate the noise-robustness of audio-only speech recognition with visual information. However, most existing efforts still focus on the audio modality to improve robustness, considering its dominance in the AVSR task, with noise adaptation techniques such as front-end denoising. Though effective, these methods are usually faced with two practical challenges: 1) lack of sufficient labeled noisy audio-visual training data in some real-world scenarios and 2) less optimal model generality to unseen testing noises. In this work, we investigate the noise-invariant visual modality to strengthen the robustness of AVSR, which can adapt to any testing noises without dependence on noisy training data, a.k.a., unsupervised noise adaptation. Inspired by the human perception mechanism, we propose a universal viseme-phoneme mapping (UniVPM) approach to implement modality transfer, which can restore clean audio from visual signals to enable speech recognition under any noisy conditions.
Extensive experiments on public benchmarks LRS3 and LRS2 show that our approach achieves state-of-the-art performance under various noisy as well as clean conditions. In addition, we also outperform previous state-of-the-art methods on the visual speech recognition task.", + "authors": [ + "Yuchen Hu", + "Ruizhe Li", + "Chen Chen", + "Chengwei Qin", + "Qiu-Shi Zhu", + "Eng Siong Chng" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.848", + "point2d": [ + -67.21118927001953, + 25.870914459228516 + ], + "cluster": 16.0 + }, + { + "idx": 850, + "title": "An Extensible Plug-and-Play Method for Multi-Aspect Controllable Text Generation", + "abstract": "Recently, multi-aspect controllable text generation that controls the generated text in multiple aspects (e.g., sentiment, topic, and keywords) has attracted increasing attention. Although methods based on parameter efficient tuning like prefix-tuning could achieve multi-aspect controlling in a plug-and-play way, the mutual interference of multiple prefixes leads to significant degeneration of constraints and limits their extensibility to training-time unseen aspect combinations. In this work, we provide a theoretical lower bound for the interference and empirically find that the interference grows with the number of layers where prefixes are inserted. Based on these analyses, we propose using trainable gates to normalize the intervention of prefixes to restrain the growing interference. As a result, controlling training-time unseen combinations of aspects can be realized by simply concatenating corresponding plugins such that new constraints can be extended at a lower cost. In addition, we propose a unified way to process both categorical and free-form constraints. Experiments on text generation and machine translation demonstrate the superiority of our approach over baselines on constraint accuracy, text quality, and extensibility.", + "authors": [ + "Xuancheng Huang", + "Zijun Liu", + "Peng Li", + "Tao Li", + "Maosong Sun", + "Yang Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.849", + "point2d": [ + -25.42005729675293, + 6.749478340148926 + ], + "cluster": 4.0 + }, + { + "idx": 851, + "title": "Double-Branch Multi-Attention based Graph Neural Network for Knowledge Graph Completion", + "abstract": "Graph neural networks (GNNs), which effectively use topological structures in the knowledge graphs (KG) to embed entities and relations in low-dimensional spaces, have shown great power in knowledge graph completion (KGC). KGs have abundant global and local structural information; however, many GNN-based KGC models cannot capture these two types of information about the graph structure by designing complex aggregation schemes, and are not designed well to learn representations of seen entities with sparse neighborhoods in isolated subgraphs. In this paper, we find that a simple attention-based method can outperform a general GNN-based approach for KGC. We then propose a double-branch multi-attention based graph neural network (MA-GNN) to learn more expressive entity representations which contain rich global-local structural information. Specifically, we first explore the graph attention network-based local aggregator to learn entity representations. Furthermore, we propose a snowball local attention mechanism by leveraging the semantic similarity between two-hop neighbors to enrich the entity embedding.
Finally, we use Transformer-based self-attention to learn long-range dependencies between entities to obtain richer representations with the global graph structure and entity features. Experimental results on five benchmark datasets show that MA-GNN achieves significant improvements over strong baselines for inductive KGC.", + "authors": [ + "Hongcai Xu", + "Junpeng Bao", + "Wenbo Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.850", + "point2d": [ + 54.951419830322266, + -65.23658752441406 + ], + "cluster": 45.0 + }, + { + "idx": 852, + "title": "Dual Cache for Long Document Neural Coreference Resolution", + "abstract": "Recent works show the effectiveness of cache-based neural coreference resolution models on long documents. These models incrementally process a long document from left to right and extract relations between mentions and entities in a cache, resulting in much lower memory and computation cost compared to computing all mentions in parallel. However, they do not handle cache misses when high-quality entities are purged from the cache, which causes wrong assignments and leads to prediction errors. We propose a new hybrid cache that integrates two eviction policies to capture global and local entities separately, and effectively reduces the aggregated cache misses by up to half, while improving the coreference F1 score by 0.7-5.7 points. As such, the hybrid policy can accelerate existing cache-based models and offer a new long document coreference resolution solution. Results show that our method outperforms existing methods on four benchmarks while saving up to 83% of inference time against non-cache-based models. Further, we achieve a new state-of-the-art on a long document coreference benchmark, LitBank.", + "authors": [ + "Qipeng Guo", + "Xiangkun Hu", + "Yue Zhang", + "Xipeng Qiu", + "Zheng Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.851", + "point2d": [ + 40.935462951660156, + -77.00550842285156 + ], + "cluster": 14.0 + }, + { + "idx": 853, + "title": "Knowledge Transfer in Incremental Learning for Multilingual Neural Machine Translation", + "abstract": "In the real-world scenario, a longstanding goal of multilingual neural machine translation (MNMT) is that a single model can incrementally adapt to new language pairs without accessing previous training data. In this scenario, previous studies concentrate on overcoming catastrophic forgetting while lacking encouragement to learn new knowledge from incremental language pairs, especially when the incremental language is not related to the set of original languages. To better acquire new knowledge, we propose a knowledge transfer method that can efficiently adapt original MNMT models to diverse incremental language pairs. The method flexibly introduces the knowledge from an external model into original models, which encourages the models to learn new language pairs, completing the procedure of knowledge transfer. Moreover, all original parameters are frozen to ensure that translation qualities on original language pairs are not degraded.
Experimental results show that our method can learn new knowledge from diverse language pairs incrementally while maintaining performance on original language pairs, outperforming various strong baselines in incremental learning for MNMT.", + "authors": [ + "Kaiyu Huang", + "Peng Li", + "Jin Ma", + "Ting Yao", + "Yang Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.852", + "point2d": [ + -63.017662048339844, + -14.355939865112305 + ], + "cluster": 21.0 + }, + { + "idx": 854, + "title": "DisorBERT: A Double Domain Adaptation Model for Detecting Signs of Mental Disorders in Social Media", + "abstract": "Mental disorders affect millions of people worldwide and cause interference with their thinking and behavior. Over the past years, awareness created by health campaigns and other sources motivated the study of these disorders using information extracted from social media platforms. In this work, we aim to contribute to the study of these disorders and to the understanding of how mental problems reflect on social media. To achieve this goal, we propose a double-domain adaptation of a language model. First, we adapted the model to social media language, and then, we adapted it to the mental health domain. In both steps, we incorporated a lexical resource to guide the masking process of the language model and, therefore, to help it in paying more attention to words related to mental disorders. We have evaluated our model in the detection of signs of three major mental disorders: Anorexia, Self-harm, and Depression. Results are encouraging as they show that the proposed adaptation enhances the classification performance and yields competitive results against state-of-the-art methods.", + "authors": [ + "Mario Aragon", + "Adrian Pastor Lopez Monroy", + "Luis Gonzalez", + "David E. Losada", + "Manuel Montes" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.853", + "point2d": [ + 33.68101119995117, + 73.46095275878906 + ], + "cluster": 42.0 + }, + { + "idx": 855, + "title": "Toward Interactive Dictation", + "abstract": "Voice dictation is an increasingly important text input modality. Existing systems that allow both dictation and editing-by-voice restrict their command language to flat templates invoked by trigger words. In this work, we study the feasibility of allowing users to interrupt their dictation with spoken editing commands in open-ended natural language. We introduce a new task and dataset, TERTiUS, to experiment with such systems. To support this flexibility in real time, a system must incrementally segment and classify spans of speech as either dictation or command, and interpret the spans that are commands. We experiment with using large pre-trained language models to predict the edited text, or alternatively, to predict a small text-editing program. Experiments show a natural trade-off between model accuracy and latency: a smaller model achieves 30% end-state accuracy with 1.3 seconds of latency, while a larger model achieves 55% end-state accuracy with 7 seconds of latency.", + "authors": [ + "Belinda Z. Li", + "Jason Eisner", + "Adam Pauls", + "Sam Thomson" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.854", + "point2d": [ + -24.41715431213379, + 35.046749114990234 + ], + "cluster": 30.0 + }, + { + "idx": 856, + "title": "CodeIE: Large Code Generation Models are Better Few-Shot Information Extractors", + "abstract": "Large language models (LLMs) pre-trained on massive corpora have demonstrated impressive few-shot learning ability on many NLP tasks. A common practice is to recast the task into a text-to-text format such that generative LLMs of natural language (NL-LLMs) like GPT-3 can be prompted to solve it. However, it is nontrivial to perform information extraction (IE) tasks with NL-LLMs since the output of the IE task is usually structured and therefore is hard to convert into plain text. In this paper, we propose to recast the structured output in the form of code instead of natural language and utilize generative LLMs of code (Code-LLMs) such as Codex to perform IE tasks, in particular, named entity recognition and relation extraction. In contrast to NL-LLMs, we show that Code-LLMs can be well-aligned with these IE tasks by designing code-style prompts and formulating these IE tasks as code generation tasks. Experimental results on seven benchmarks show that our method consistently outperforms fine-tuning moderate-size pre-trained models specially designed for IE tasks (e.g., UIE) and prompting NL-LLMs under few-shot settings. We further conduct a series of in-depth analyses to demonstrate the merits of leveraging Code-LLMs for IE tasks.", + "authors": [ + "Peng Li", + "Tianxiang Sun", + "Qiong Tang", + "Hang Yan", + "Yuanbin Wu", + "Xuanjing Huang", + "Xipeng Qiu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.855", + "point2d": [ + -8.758045196533203, + -49.034942626953125 + ], + "cluster": 11.0 + }, + { + "idx": 857, + "title": "Beyond English-Centric Bitexts for Better Multilingual Language Representation Learning", + "abstract": "In this paper, we elaborate upon recipes for building multilingual representation models that are not only competitive with existing state-of-the-art models but are also more parameter efficient, thereby promoting better adoption in resource-constrained scenarios and practical applications. We show that going beyond English-centric bitexts, coupled with a novel sampling strategy aimed at reducing under-utilization of training data, substantially boosts performance across model sizes for both Electra and MLM pre-training objectives. We introduce XY-LENT: X-Y bitext enhanced Language ENcodings using Transformers which not only achieves state-of-the-art performance over 5 cross-lingual tasks within all model size bands, but is also competitive across bands. Our XY-LENT XL variant outperforms XLM-R XXL and exhibits competitive performance with mT5 XXL while being 5x and 6x smaller respectively. We then show that our proposed method helps ameliorate the curse of multilinguality, with the XY-LENT XL achieving 99.3% GLUE performance and 98.5% SQuAD 2.0 performance compared to a SoTA English-only model in the same size band.
We then analyze our models' performance on extremely low-resource languages and posit that scaling alone may not be sufficient for improving performance in this scenario.", + "authors": [ + "Barun Patra", + "Saksham Singhal", + "Shaohan Huang", + "Zewen Chi", + "Li Dong", + "Furu Wei", + "Vishrav Chaudhary", + "Xia Song" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.856", + "point2d": [ + -53.57884979248047, + -11.286002159118652 + ], + "cluster": 21.0 + }, + { + "idx": 858, + "title": "Bridging The Gap: Entailment Fused-T5 for Open-retrieval Conversational Machine Reading Comprehension", + "abstract": "Open-retrieval conversational machine reading comprehension (OCMRC) simulates real-life conversational interaction scenes. Machines are required to make a decision of \u201cYes/No/Inquire\u201d or generate a follow-up question when the decision is \u201cInquire\u201d based on retrieved rule texts, user scenario, user question and dialogue history. Recent studies try to reduce the information gap between decision-making and question generation, in order to improve the performance of generation. However, the information gap still persists because these methods are still limited to a pipeline framework, where decision-making and question generation are performed separately, making it hard to share the entailment reasoning used in decision-making across all stages. To tackle the above problem, we propose a novel one-stage end-to-end framework, called Entailment Fused-T5 (EFT), to bridge the information gap between decision-making and question generation in a global understanding manner. The extensive experimental results demonstrate that our proposed framework achieves new state-of-the-art performance on the OR-ShARC benchmark. Our model and code are publicly available at an anonymous link.", + "authors": [ + "Xiao Zhang", + "Heyan Huang", + "Zewen Chi", + "Xian-Ling Mao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.857", + "point2d": [ + 67.21776580810547, + 6.471963882446289 + ], + "cluster": 5.0 + }, + { + "idx": 859, + "title": "LiveChat: A Large-Scale Personalized Dialogue Dataset Automatically Constructed from Live Streaming", + "abstract": "Open-domain dialogue systems have made promising progress in recent years. While the state-of-the-art dialogue agents are built upon large-scale social media data and large pre-trained models, there is no guarantee these agents could also perform well in fast-growing scenarios, such as live streaming, due to the bounded transferability of pre-trained models and biased distributions of public datasets from Reddit and Weibo, etc. To improve the essential capability of responding and establish a benchmark in the live open-domain scenario, we introduce the LiveChat dataset, composed of 1.33 million real-life Chinese dialogues with almost 3800 average sessions across 351 personas and fine-grained profiles for each persona. LiveChat is automatically constructed by processing numerous live videos on the Internet and naturally falls within the scope of multi-party conversations, where the issues of Who says What to Whom should be considered. Therefore, we target two critical tasks of response modeling and addressee recognition and propose retrieval-based baselines grounded on advanced techniques. Experimental results have validated the positive effects of leveraging persona profiles and larger average sessions per persona.
In addition, we also benchmark the transferability of advanced generation-based models on LiveChat and pose some future directions for current challenges.", + "authors": [ + "Jingsheng Gao", + "Yixin Lian", + "Ziyi Zhou", + "Yuzhuo Fu", + "Baoyuan Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.858", + "point2d": [ + 11.55048942565918, + 63.686195373535156 + ], + "cluster": 49.0 + }, + { + "idx": 860, + "title": "Prompting PaLM for Translation: Assessing Strategies and Performance", + "abstract": "Large language models (LLMs) that have been trained on multilingual but not parallel text exhibit a remarkable ability to translate between languages. We probe this ability in an in-depth study of the pathways language model (PaLM), which has demonstrated the strongest machine translation (MT) performance among similarly-trained LLMs to date. We investigate various strategies for choosing translation examples for few-shot prompting, concluding that example quality is the most important factor. Using optimized prompts, we revisit previous assessments of PaLM\u2019s MT capabilities with more recent test sets, modern MT metrics, and human evaluation, and find that its performance, while impressive, still lags that of state-of-the-art supervised systems. We conclude by providing an analysis of PaLM\u2019s MT output which reveals some interesting properties and prospects for future work.", + "authors": [ + "David Vilar", + "Markus Freitag", + "Colin Cherry", + "Jiaming Luo", + "Viresh Ratnakar", + "George Foster" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.859", + "point2d": [ + -63.73899459838867, + -2.6284868717193604 + ], + "cluster": 1.0 + }, + { + "idx": 861, + "title": "Exploring Lottery Prompts for Pre-trained Language Models", + "abstract": "Consistently scaling pre-trained language models (PLMs) imposes substantial burdens on model adaptation, necessitating more efficient alternatives to conventional fine-tuning.Given the advantage of prompting in the zero-shot setting and the observed performance fluctuation among different prompts, we explore the instance-level prompt and their generalizability.By searching through the prompt space, we first validate the assumption that for every instance, there is almost always a lottery prompt that induces the correct prediction from the PLM, and such prompt can be obtained at a low cost thanks to the inherent ability of PLMs.Meanwhile, it is shown that some strong lottery prompts have high performance over the whole training set, and they are equipped with distinguishable linguistic features.Lastly, we attempt to generalize the searched strong lottery prompts to unseen data with prompt ensembling method.Experiments are conducted on various types of NLP classification tasks and demonstrate that the proposed method can achieve comparable results with other gradient-free and optimization-free baselines.", + "authors": [ + "Yulin Chen", + "Ning Ding", + "Xiaobin Wang", + "Shengding Hu", + "Haitao Zheng", + "Zhiyuan Liu", + "Pengjun Xie" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.860", + "point2d": [ + -18.150480270385742, + -9.276809692382812 + ], + "cluster": 3.0 + }, + { + "idx": 862, + "title": "A Facial Expression-Aware Multimodal Multi-task Learning Framework for Emotion Recognition in Multi-party Conversations", + "abstract": "Multimodal Emotion Recognition in 
Multiparty Conversations (MERMC) has recently attracted considerable attention. Due to the complexity of visual scenes in multi-party conversations, most previous MERMC studies mainly focus on text and audio modalities while ignoring visual information. Recently, several works proposed to extract face sequences as visual features and have shown the importance of visual information in MERMC. However, given an utterance, the face sequence extracted by previous methods may contain multiple people\u2019s faces, which will inevitably introduce noise to the emotion prediction of the real speaker. To tackle this issue, we propose a two-stage framework named Facial expression-aware Multimodal Multi-Task learning (FacialMMT). Specifically, a pipeline method is first designed to extract the face sequence of the real speaker of each utterance, which consists of multimodal face recognition, unsupervised face clustering, and face matching. With the extracted face sequences, we propose a multimodal facial expression-aware emotion recognition model, which leverages the frame-level facial emotion distributions to help improve utterance-level emotion recognition based on multi-task learning. Experiments demonstrate the effectiveness of the proposed FacialMMT framework on the benchmark MELD dataset. The source code is publicly released at https://github.com/NUSTM/FacialMMT.", + "authors": [ + "Wenjie Zheng", + "Jianfei Yu", + "Rui Xia", + "Shijin Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.861", + "point2d": [ + -37.48159408569336, + 62.520442962646484 + ], + "cluster": 23.0 + }, + { + "idx": 863, + "title": "TeAST: Temporal Knowledge Graph Embedding via Archimedean Spiral Timeline", + "abstract": "Temporal knowledge graph embedding (TKGE) models are commonly utilized to infer the missing facts and facilitate reasoning and decision-making in temporal knowledge graph based systems. However, existing methods fuse temporal information into entities, potentially leading to the evolution of entity information and limiting the link prediction performance of TKG. Meanwhile, current TKGE models often lack the ability to simultaneously model important relation patterns and provide interpretability, which hinders their effectiveness and potential applications. To address these limitations, we propose a novel TKGE model which encodes Temporal knowledge graph embeddings via Archimedean Spiral Timeline (TeAST), which maps relations onto the corresponding Archimedean spiral timeline and transforms quadruple completion into a third-order tensor completion problem. Specifically, the Archimedean spiral timeline ensures that relations that occur simultaneously are placed on the same timeline, and all relations evolve over time. Meanwhile, we present a novel temporal spiral regularizer to make the spiral timeline orderly. In addition, we provide mathematical proofs to demonstrate the ability of TeAST to encode various relation patterns. Experimental results show that our proposed model significantly outperforms existing TKGE methods.
Our code is available at https://github.com/IMU-MachineLearningSXD/TeAST.", + "authors": [ + "Jiang Li", + "Xiangdong Su", + "Guanglai Gao" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.862", + "point2d": [ + 58.760536193847656, + -62.17238998413086 + ], + "cluster": 45.0 + }, + { + "idx": 864, + "title": "Human Inspired Progressive Alignment and Comparative Learning for Grounded Word Acquisition", + "abstract": "Human language acquisition is an efficient, supervised, and continual process. In this work, we took inspiration from how human babies acquire their first language, and developed a computational process for word acquisition through comparative learning. Motivated by cognitive findings, we generated a small dataset that enables the computation models to compare the similarities and differences of various attributes, learn to filter out and extract the common information for each shared linguistic label. We frame the acquisition of words as not only the information filtration process, but also as representation-symbol mapping. This procedure does not involve a fixed vocabulary size, nor a discriminative objective, and allows the models to continually learn more concepts efficiently. Our results in controlled experiments have shown the potential of this approach for efficient continual learning of grounded words.", + "authors": [ + "Yuwei Bao", + "Barrett Lattimer", + "Joyce Chai" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.863", + "point2d": [ + -28.357032775878906, + -4.1166462898254395 + ], + "cluster": 9.0 + }, + { + "idx": 865, + "title": "Conjunct Lengths in English, Dependency Length Minimization, and Dependency Structure of Coordination", + "abstract": "This paper confirms that, in English binary coordinations, left conjuncts tend to be shorter than right conjuncts, regardless of the position of the governor of the coordination. We demonstrate that this tendency becomes stronger when length differences are greater, but only when the governor is on the left or absent, not when it is on the right. We explain this effect via Dependency Length Minimization and we show that this explanation provides support for symmetrical dependency structures of coordination (where coordination is multi-headed by all conjuncts, as in Word Grammar or in enhanced Universal Dependencies, or where it single-headed by the conjunction, as in the Prague Dependency Treebank), as opposed to asymmetrical structures (where coordination is headed by the first conjunct, as in the Meaning\u2013Text Theory or in basic Universal Dependencies).", + "authors": [ + "Adam Przepi\u00f3rkowski", + "Micha\u0142 Wo\u017aniak" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.864", + "point2d": [ + -17.306238174438477, + -70.97785949707031 + ], + "cluster": 41.0 + }, + { + "idx": 866, + "title": "LeXFiles and LegalLAMA: Facilitating English Multinational Legal Language Model Development", + "abstract": "In this work, we conduct a detailed analysis on the performance of legal-oriented pre-trained language models (PLMs). We examine the interplay between their original objective, acquired knowledge, and legal language understanding capacities which we define as the upstream, probing, and downstream performance, respectively. We consider not only the models\u2019 size but also the pre-training corpora used as important dimensions in our study. 
To this end, we release a multinational English legal corpus (LeXFiles) and a legal knowledge probing benchmark (LegalLAMA) to facilitate training and detailed analysis of legal-oriented PLMs. We release two new legal PLMs trained on LeXFiles and evaluate them alongside others on LegalLAMA and LexGLUE. We find that probing performance strongly correlates with upstream performance in related legal topics. On the other hand, downstream performance is mainly driven by the model\u2019s size and prior legal knowledge which can be estimated by upstream and probing performance. Based on these findings, we can conclude that both dimensions are important for those seeking the development of domain-specific PLMs.", + "authors": [ + "Ilias Chalkidis", + "Nicolas Garneau", + "Catalina Goanta", + "Daniel Katz", + "Anders S\u00f8gaard" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.865", + "point2d": [ + -29.094202041625977, + -32.549983978271484 + ], + "cluster": 46.0 + }, + { + "idx": 867, + "title": "Revisiting Commonsense Reasoning in Machine Translation: Training, Evaluation and Challenge", + "abstract": "The ability of commonsense reasoning (CR) decides whether a neural machine translation (NMT) model can move beyond pattern recognition. Despite the rapid advancement of NMT and the use of pretraining to enhance NMT models, research on CR in NMT is still in its infancy, leaving much to be explored in terms of effectively training NMT models with high CR abilities and devising accurate automatic evaluation metrics. This paper presents a comprehensive study aimed at expanding the understanding of CR in NMT.For the training, we confirm the effectiveness of incorporating pretrained knowledge into NMT models and subsequently utilizing these models as robust testbeds for investigating CR in NMT. For the evaluation, we propose a novel entity-aware evaluation method that takes into account both the NMT candidate and important entities in the candidate, which is more aligned with human judgement. Based on the strong testbed and evaluation methods, we identify challenges in training NMT models with high CR abilities and suggest directions for further unlabeled data utilization and model design. We hope that our methods and findings will contribute to advancing the research of CR in NMT. Source data, code and scripts are freely available at https://github.com/YutongWang1216/CR-NMT.", + "authors": [ + "Xuebo Liu", + "Yutong Wang", + "Derek F. Wong", + "Runzhe Zhan", + "Liangxuan Yu", + "Min Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.866", + "point2d": [ + -64.95286560058594, + -6.1253581047058105 + ], + "cluster": 1.0 + }, + { + "idx": 868, + "title": "NOTABLE: Transferable Backdoor Attacks Against Prompt-based NLP Models", + "abstract": "Prompt-based learning is vulnerable to backdoor attacks. Existing backdoor attacks against prompt-based models consider injecting backdoors into the entire embedding layers or word embedding vectors. Such attacks can be easily affected by retraining on downstream tasks and with different prompting strategies, limiting the transferability of backdoor attacks. In this work, we propose transferable backdoor attacks against prompt-based models, called NOTABLE, which is independent of downstream tasks and prompting strategies. 
Specifically, NOTABLE injects backdoors into the encoders of PLMs by utilizing an adaptive verbalizer to bind triggers to specific words (i.e., anchors). It activates the backdoor by pasting input with triggers to reach adversary-desired anchors, achieving independence from downstream tasks and prompting strategies. We conduct experiments on six NLP tasks, three popular models, and three prompting strategies. Empirical results show that NOTABLE achieves superior attack performance (i.e., attack success rate over 90% on all the datasets), and outperforms two state-of-the-art baselines. Evaluations on three defenses show the robustness of NOTABLE. Our code can be found at https://github.com/RU-System-Software-and-Security/Notable.", + "authors": [ + "Kai Mei", + "Zheng Li", + "Zhenting Wang", + "Yang Zhang", + "Shiqing Ma" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.867", + "point2d": [ + 0.06884396076202393, + 9.701533317565918 + ], + "cluster": 15.0 + }, + { + "idx": 869, + "title": "Revisiting Relation Extraction in the era of Large Language Models", + "abstract": "Relation extraction (RE) is the core NLP task of inferring semantic relationships between entities from text. Standard supervised RE techniques entail training modules to tag tokens comprising entity spans and then predict the relationship between them. Recent work has instead treated the problem as a sequence-to-sequence task, linearizing relations between entities as target strings to be generated conditioned on the input. Here we push the limits of this approach, using larger language models (GPT-3 and Flan-T5 large) than considered in prior work and evaluating their performance on standard RE tasks under varying levels of supervision. We address issues inherent to evaluating generative approaches to RE by doing human evaluations, in lieu of relying on exact matching. Under this refined evaluation, we find that: (1) Few-shot prompting with GPT-3 achieves near SOTA performance, i.e., roughly equivalent to existing fully supervised models; (2) Flan-T5 is not as capable in the few-shot setting, but supervising and fine-tuning it with Chain-of-Thought (CoT) style explanations (generated via GPT-3) yields SOTA results. We release this model as a new baseline for RE tasks.", + "authors": [ + "Somin Wadhwa", + "Silvio Amir", + "Byron Wallace" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.868", + "point2d": [ + 41.60173034667969, + -59.95499038696289 + ], + "cluster": 25.0 + }, + { + "idx": 870, + "title": "Pre-trained Language Models Can be Fully Zero-Shot Learners", + "abstract": "How can we extend a pre-trained model to many language understanding tasks, without labeled or additional unlabeled data? Pre-trained language models (PLMs) have been effective for a wide range of NLP tasks. However, existing approaches either require fine-tuning on downstream labeled datasets or manually constructing proper prompts. In this paper, we propose nonparametric prompting PLM (NPPrompt) for fully zero-shot language understanding. Unlike previous methods, NPPrompt uses only pre-trained language models and does not require any labeled data or additional raw corpus for further fine-tuning, nor does it rely on humans to construct a comprehensive set of prompt label words. 
We evaluate NPPrompt against previous major few-shot and zero-shot learning methods on diverse NLP tasks, including text classification, text entailment, similar text retrieval, paraphrasing, and multiple-choice question answering. Experimental results demonstrate that our NPPrompt outperforms the previous best fully zero-shot method by large margins, with absolute gains of 12.8% in accuracy on text classification and 15.6% on the GLUE benchmark. Our source code is available at https://anonymous.4open.science/r/NPPrompt.", + "authors": [ + "Xuandong Zhao", + "Siqi Ouyang", + "Zhiguo Yu", + "Ming Wu", + "Lei Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.869", + "point2d": [ + -12.545182228088379, + -9.14875602722168 + ], + "cluster": 20.0 + }, + { + "idx": 871, + "title": "Can Large Language Models Be an Alternative to Human Evaluations?", + "abstract": "Human evaluation is indispensable and inevitable for assessing the quality of texts generated by machine learning models or written by humans. However, human evaluation is very difficult to reproduce and its quality is notoriously unstable, hindering fair comparisons among different natural language processing (NLP) models and algorithms. Recently, large language models (LLMs) have demonstrated exceptional performance on unseen tasks when only the task instructions are provided. In this paper, we explore if such an ability of the LLMs can be used as an alternative to human evaluation. We present the LLMs with the exact same instructions, samples to be evaluated, and questions used to conduct human evaluation, and then ask the LLMs to generate responses to those questions; we dub this LLM evaluation. We use human evaluation and LLM evaluation to evaluate the texts in two NLP tasks: open-ended story generation and adversarial attacks. We show that the result of LLM evaluation is consistent with the results obtained by expert human evaluation: the texts rated higher by human experts are also rated higher by the LLMs. We also find that the results of LLM evaluation are stable over different formatting of the task instructions and the sampling algorithm used to generate the answer. We are the first to show the potential of using LLMs to assess the quality of texts and discuss the limitations and ethical considerations of LLM evaluation.", + "authors": [ + "Cheng-Han Chiang", + "Hung-yi Lee" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.870", + "point2d": [ + 3.821570873260498, + 10.206053733825684 + ], + "cluster": 4.0 + }, + { + "idx": 872, + "title": "HyperMixer: An MLP-based Low Cost Alternative to Transformers", + "abstract": "Transformer-based architectures are the model of choice for natural language understanding, but they come at a significant cost, as they have quadratic complexity in the input length, require a lot of training data, and can be difficult to tune. In the pursuit of lower costs, we investigate simple MLP-based architectures. We find that existing architectures such as MLPMixer, which achieves token mixing through a static MLP applied to each feature independently, are too detached from the inductive biases required for natural language understanding. In this paper, we propose a simple variant, HyperMixer, which forms the token mixing MLP dynamically using hypernetworks. Empirically, we demonstrate that our model performs better than alternative MLP-based models, and on par with Transformers.
In contrast to Transformers, HyperMixer achieves these results at substantially lower costs in terms of processing time, training data, and hyperparameter tuning.", + "authors": [ + "Florian Mai", + "Arnaud Pannatier", + "Fabio Fehr", + "Haolin Chen", + "Francois Marelli", + "Francois Fleuret", + "James Henderson" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.871", + "point2d": [ + -37.23847961425781, + -32.97175598144531 + ], + "cluster": 6.0 + }, + { + "idx": 873, + "title": "UnitY: Two-pass Direct Speech-to-speech Translation with Discrete Units", + "abstract": "Direct speech-to-speech translation (S2ST), in which all components can be optimized jointly, is advantageous over cascaded approaches to achieve fast inference with a simplified pipeline. We present a novel two-pass direct S2ST architecture, UnitY, which first generates textual representations and predicts discrete acoustic units subsequently. We enhance the model performance by subword prediction in the first-pass decoder, advanced two-pass decoder architecture design and search strategy, and better training regularization. To leverage large amounts of unlabeled text data, we pre-train the first-pass text decoder based on the self-supervised denoising auto-encoding task. Experimental evaluations on benchmark datasets at various data scales demonstrate that UnitY outperforms a single-pass speech-to-unit translation model by 2.5-4.2 ASR-BLEU with 2.83x decoding speed-up. We show that the proposed methods boost the performance even when predicting spectrogram in the second pass. However, predicting discrete units achieves 2.51x decoding speed-up compared to that case.", + "authors": [ + "Hirofumi Inaguma", + "Sravya Popuri", + "Ilia Kulikov", + "Peng-Jen Chen", + "Changhan Wang", + "Yu-An Chung", + "Yun Tang", + "Ann Lee", + "Shinji Watanabe", + "Juan Pino" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.872", + "point2d": [ + -66.63463592529297, + 18.247966766357422 + ], + "cluster": 37.0 + }, + { + "idx": 874, + "title": "Estimating the Uncertainty in Emotion Attributes using Deep Evidential Regression", + "abstract": "In automatic emotion recognition (AER), labels assigned by different human annotators to the same utterance are often inconsistent due to the inherent complexity of emotion and the subjectivity of perception. Though deterministic labels generated by averaging or voting are often used as the ground truth, it ignores the intrinsic uncertainty revealed by the inconsistent labels. This paper proposes a Bayesian approach, deep evidential emotion regression (DEER), to estimate the uncertainty in emotion attributes. Treating the emotion attribute labels of an utterance as samples drawn from an unknown Gaussian distribution, DEER places an utterance-specific normal-inverse gamma prior over the Gaussian likelihood and predicts its hyper-parameters using a deep neural network model. It enables a joint estimation of emotion attributes along with the aleatoric and epistemic uncertainties. 
AER experiments on the widely used MSP-Podcast and IEMOCAP datasets showed that DEER produced state-of-the-art results for both the mean values and the distribution of emotion attributes.",
Our code and checkpoints can be found at https://github.com/thunlp/Document-Plugin.",
Code and settings are available at https://github.com/christa60/bias-var-fine-tuning-plms.git",
From the experimental results, we verify that 1) RankEncoder achieves 80.07% Spearman\u2019s correlation, a 1.1% absolute improvement compared to the previous state-of-the-art performance, 2) RankEncoder is universally applicable to existing unsupervised sentence embedding methods, and 3) RankEncoder is specifically effective for predicting the similarity scores of similar sentence pairs.", + "authors": [ + "Yeon Seonwoo", + "Guoyin Wang", + "Changmin Seo", + "Sajal Choudhary", + "Jiwei Li", + "Xiang Li", + "Puyang Xu", + "Sunghyun Park", + "Alice Oh" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.879", + "point2d": [ + 2.78848934173584, + -34.180213928222656 + ], + "cluster": 20.0 + }, + { + "idx": 881, + "title": "To Revise or Not to Revise: Learning to Detect Improvable Claims for Argumentative Writing Support", + "abstract": "Optimizing the phrasing of argumentative text is crucial in higher education and professional development. However, assessing whether and how the different claims in a text should be revised is a hard task, especially for novice writers. In this work, we explore the main challenges to identifying argumentative claims in need of specific revisions. By learning from collaborative editing behaviors in online debates, we seek to capture implicit revision patterns in order to develop approaches aimed at guiding writers in how to further improve their arguments. We systematically compare the ability of common word embedding models to capture the differences between different versions of the same text, and we analyze their impact on various types of writing issues. To deal with the noisy nature of revision-based corpora, we propose a new sampling strategy based on revision distance. Opposed to approaches from prior work, such sampling can be done without employing additional annotations and judgments. Moreover, we provide evidence that using contextual information and domain knowledge can further improve prediction results. How useful a certain type of context is, depends on the issue the claim is suffering from, though.", + "authors": [ + "Gabriella Skitalinskaya", + "Henning Wachsmuth" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.880", + "point2d": [ + 41.70110321044922, + 40.567684173583984 + ], + "cluster": 19.0 + }, + { + "idx": 882, + "title": "Human-in-the-loop Evaluation for Early Misinformation Detection: A Case Study of COVID-19 Treatments", + "abstract": "We present a human-in-the-loop evaluation framework for fact-checking novel misinformation claims and identifying social media messages that support them. Our approach extracts check-worthy claims, which are aggregated and ranked for review. Stance classifiers are then used to identify tweets supporting novel misinformation claims, which are further reviewed to determine whether they violate relevant policies. To demonstrate the feasibility of our approach, we develop a baseline system based on modern NLP methods for human-in-the-loop fact-checking in the domain of COVID-19 treatments. 
We make our data and detailed annotation guidelines available to support the evaluation of human-in-the-loop systems that identify novel misinformation directly from raw user-generated content.", + "authors": [ + "Ethan Mendes", + "Yang Chen", + "Wei Xu", + "Alan Ritter" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.881", + "point2d": [ + 31.27888298034668, + 10.864534378051758 + ], + "cluster": 19.0 + }, + { + "idx": 883, + "title": "Composition-contrastive Learning for Sentence Embeddings", + "abstract": "Vector representations of natural language are ubiquitous in search applications. Recently, various methods based on contrastive learning have been proposed to learn textual representations from unlabelled data; by maximizing alignment between minimally-perturbed embeddings of the same text, and encouraging a uniform distribution of embeddings across a broader corpus. Differently, we propose maximizing alignment between texts and a composition of their phrasal constituents. We consider several realizations of this objective and elaborate the impact on representations in each case. Experimental results on semantic textual similarity tasks show improvements over baselines that are comparable with state-of-the-art approaches. Moreover, this work is the first to do so without incurring costs in auxiliary training objectives or additional network parameters.", + "authors": [ + "Sachin Chanchani", + "Ruihong Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.882", + "point2d": [ + 1.049079418182373, + -35.781951904296875 + ], + "cluster": 9.0 + }, + { + "idx": 884, + "title": "Causes and Cures for Interference in Multilingual Translation", + "abstract": "Multilingual machine translation models can benefit from synergy between different language pairs, but also suffer from interference. While there is a growing number of sophisticated methods that aim to eliminate interference, our understanding of interference as a phenomenon is still limited. This work identifies the main factors that contribute to interference in multilingual machine translation. Through systematic experimentation, we find that interference (or synergy) are primarily determined by model size, data size, and the proportion of each language pair within the total dataset. We observe that substantial interference occurs mainly when the model is very small with respect to the available training data, and that using standard transformer configurations with less than one billion parameters largely alleviates interference and promotes synergy. Moreover, we show that tuning the sampling temperature to control the proportion of each language pair in the data is key to balancing the amount of interference between low and high resource language pairs effectively, and can lead to superior performance overall.", + "authors": [ + "Uri Shaham", + "Maha Elbayad", + "Vedanuj Goswami", + "Omer Levy", + "Shruti Bhosale" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.883", + "point2d": [ + -66.89439392089844, + -2.576308250427246 + ], + "cluster": 21.0 + }, + { + "idx": 885, + "title": "Understanding and Bridging the Modality Gap for Speech Translation", + "abstract": "How to achieve better end-to-end speech translation (ST) by leveraging (text) machine translation (MT) data? 
Among various existing techniques, multi-task learning is an effective way to share knowledge between ST and MT, in which additional MT data can help learn the source-to-target mapping. However, due to the differences between speech and text, there is always a gap between ST and MT. In this paper, we first aim to understand this modality gap from the perspective of target-side representation differences, and link the modality gap to another well-known problem in neural machine translation: exposure bias. We find that the modality gap is relatively small during training except for some difficult cases, but keeps increasing during inference due to the cascading effect. To address these problems, we propose the Cross-modal Regularization with Scheduled Sampling (Cress) method. Specifically, we regularize the output predictions of ST and MT, whose target-side contexts are derived by sampling between ground-truth words and self-generated words with a varying probability. Furthermore, we introduce token-level adaptive training, which assigns different training weights to target tokens to handle difficult cases with large modality gaps. Experiments and analysis show that our approach effectively bridges the modality gap and achieves significant improvements over a strong baseline in all eight directions of the MuST-C dataset.",
DICE also trains an auxiliary mention identification task jointly with the event extraction tasks to better identify entity mention boundaries, and further introduces special markers to incorporate identified entity mentions as trigger and argument candidates for their respective tasks. To benchmark clinical event extraction, we compose MACCROBAT-EE, the first clinical event extraction dataset with argument annotation, based on the existing clinical information extraction dataset MACCROBAT. Our experiments demonstrate the state-of-the-art performance of DICE for clinical and news-domain event extraction, especially under low-data settings.",
We propose an effective training framework, INK, to directly smooth the representation space by adjusting the representations of kNN neighbors with a small number of new parameters. The new parameters are then used to refresh the whole representation datastore to obtain new kNN knowledge asynchronously. This loop keeps running until convergence. Experiments on four benchmark datasets show that INK achieves average gains of 1.99 COMET and 1.0 BLEU, outperforming the state-of-the-art kNN-MT system while using 0.02x the memory space and delivering a 1.9x inference speedup.",
Our proposed approach for cross-modal attribute insertions is modular, controllable, and task-agnostic. We find that augmenting input text using cross-modal insertions causes state-of-the-art approaches for text-to-image retrieval and cross-modal entailment to perform poorly, resulting in relative drops of ~15% in MRR and ~20% in F1 score, respectively. Crowd-sourced annotations demonstrate that cross-modal insertions lead to higher quality augmentations for multimodal data than augmentations using text-only data, and are equivalent in quality to original examples. We release the code to encourage robustness evaluations of deep vision-and-language models: https://github.com/claws-lab/multimodal-robustness-xmai", + "authors": [ + "Shivaen Ramshetty", + "Gaurav Verma", + "Srijan Kumar" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.890", + "point2d": [ + -58.79741287231445, + 33.62810134887695 + ], + "cluster": 26.0 + }, + { + "idx": 892, + "title": "Crosslingual Generalization through Multitask Finetuning", + "abstract": "Multitask prompted finetuning (MTF) has been shown to help large language models generalize to new tasks in a zero-shot setting, but so far explorations of MTF have focused on English data and models. We apply MTF to the pretrained multilingual BLOOM and mT5 model families to produce finetuned variants called BLOOMZ and mT0. We find finetuning large multilingual language models on English tasks with English prompts allows for task genrealization to non-English languages that appear only in the pretraining corpus. Finetuning on multilingual tasks with English prompts further improves performance on English and non-English tasks leading to various state-of-the-art zero-shot results. We also investigate finetuning on multilingual tasks with prompts that have been machine-translated from English to match the language of each dataset. We find training on these machine-translated prompts leads to better performance on human-written prompts in the respective languages. Surprisingly, we find models are capable of zero-shot generalization to tasks in languages they have never intentionally seen. We conjecture that the models are learning higher-level capabilities that are both task- and language-agnostic. In addition, we introduce xP3, a composite of supervised datasets in 46 languages with English and machine-translated prompts. Our code, datasets and models are freely available at https://github.com/ bigscience-workshop/xmtf.", + "authors": [ + "Niklas Muennighoff", + "Thomas Wang", + "Lintang Sutawika", + "Adam Roberts", + "Stella Biderman", + "Teven Le Scao", + "M Saiful Bari", + "Sheng Shen", + "Zheng Xin Yong", + "Hailey Schoelkopf", + "Xiangru Tang", + "Dragomir Radev", + "Alham Fikri Aji", + "Khalid Almubarak", + "Samuel Albanie", + "Zaid Alyafeai", + "Albert Webson", + "Edward Raff", + "Colin Raffel" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.891", + "point2d": [ + -19.373493194580078, + -10.917952537536621 + ], + "cluster": 3.0 + }, + { + "idx": 893, + "title": "Evaluate AMR Graph Similarity via Self-supervised Learning", + "abstract": "In work on AMR (Abstract Meaning Representation), similarity metrics are crucial as they are used to evaluate AMR systems such as AMR parsers. Current AMR metrics are all based on nodes or triples matching without considering the entire structures of AMR graphs. 
To address this problem, and inspired by learned similarity evaluation on plain text, we propose AMRSim, an automatic AMR graph similarity evaluation metric. To overcome the high cost of collecting human-annotated data, AMRSim automatically generates silver AMR graphs and utilizes self-supervised learning methods. We evaluated AMRSim on various datasets and found that AMRSim significantly improves the correlations with human semantic scores and remains robust under diverse challenges. We also discuss how AMRSim can be extended to multilingual cases.", + "authors": [ + "Ziyi Shou", + "Fangzhen Lin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.892", + "point2d": [ + 40.483699798583984, + -55.360965728759766 + ], + "cluster": 9.0 + }, + { + "idx": 894, + "title": "Analyzing Transformers in Embedding Space", + "abstract": "Understanding Transformer-based models has attracted significant attention, as they lie at the heart of recent technological advances across machine learning. While most interpretability methods rely on running models over inputs, recent work has shown that a zero-pass approach, where parameters are interpreted directly without a forward/backward pass is feasible for some Transformer parameters, and for two-layer attention networks. In this work, we present a theoretical analysis where all parameters of a trained Transformer are interpreted by projecting them into the embedding space, that is, the space of vocabulary items they operate on. We derive a simple theoretical framework to support our arguments and provide ample evidence for its validity. First, an empirical analysis showing that parameters of both pretrained and fine-tuned models can be interpreted in embedding space. Second, we present two applications of our framework: (a) aligning the parameters of different models that share a vocabulary, and (b) constructing a classifier without training by \u201ctranslating\u201d the parameters of a fine-tuned classifier to parameters of a different model that was only pretrained. Overall, our findings open the door to interpretation methods that, at least in part, abstract away from model specifics and operate in the embedding space only.", + "authors": [ + "Guy Dar", + "Mor Geva", + "Ankit Gupta", + "Jonathan Berant" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.893", + "point2d": [ + -42.6081657409668, + -30.77349090576172 + ], + "cluster": 6.0 + }, + { + "idx": 895, + "title": "Few-Shot Data-to-Text Generation via Unified Representation and Multi-Source Learning", + "abstract": "In this paper, we present a novel approach for data-to-text generation that addresses the limitations of current methods that primarily focus on specific types of structured data. Our proposed method aims to improve performance in multi-task training, zero-shot and few-shot scenarios by providing a unified representation that can handle various forms of structured data such as tables, knowledge graph triples, and meaning representations. We demonstrate that our proposed approach can effectively adapt to new structured forms, and can improve performance in comparison to current methods. For example, our method resulted in a 66% improvement in zero-shot BLEU scores when transferring models trained on table inputs to a knowledge graph dataset. 
Our proposed method is an important step towards a more general data-to-text generation framework.", + "authors": [ + "Alexander Hanbo Li", + "Mingyue Shang", + "Evangelia Spiliopoulou", + "Jie Ma", + "Patrick Ng", + "Zhiguo Wang", + "Bonan Min", + "William Yang Wang", + "Kathleen McKeown", + "Vittorio Castelli", + "Dan Roth", + "Bing Xiang" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.894", + "point2d": [ + -6.772817134857178, + -45.76295852661133 + ], + "cluster": 4.0 + }, + { + "idx": 896, + "title": "FactKG: Fact Verification via Reasoning on Knowledge Graphs", + "abstract": "In real world applications, knowledge graphs (KG) are widely used in various domains (e.g. medical applications and dialogue agents). However, for fact verification, KGs have not been adequately utilized as a knowledge source. KGs can be a valuable knowledge source in fact verification due to their reliability and broad applicability. A KG consists of nodes and edges which makes it clear how concepts are linked together, allowing machines to reason over chains of topics. However, there are many challenges in understanding how these machine-readable concepts map to information in text. To enable the community to better use KGs, we introduce a new dataset, FactKG: Fact Verificationvia Reasoning on Knowledge Graphs. It consists of 108k natural language claims with five types of reasoning: One-hop, Conjunction, Existence, Multi-hop, and Negation. Furthermore, FactKG contains various linguistic patterns, including colloquial style claims as well as written style claims to increase practicality. Lastly, we develop a baseline approach and analyze FactKG over these reasoning types. We believe FactKG can advance both reliability and practicality in KG-based fact verification.", + "authors": [ + "Jiho Kim", + "Sungjin Park", + "Yeonsu Kwon", + "Yohan Jo", + "James Thorne", + "Edward Choi" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.895", + "point2d": [ + 63.93739318847656, + -10.588224411010742 + ], + "cluster": 31.0 + }, + { + "idx": 897, + "title": "DrBERT: A Robust Pre-trained Model in French for Biomedical and Clinical domains", + "abstract": "In recent years, pre-trained language models (PLMs) achieve the best performance on a wide range of natural language processing (NLP) tasks. While the first models were trained on general domain data, specialized ones have emerged to more effectively treat specific domains. In this paper, we propose an original study of PLMs in the medical domain on French language. We compare, for the first time, the performance of PLMs trained on both public data from the web and private data from healthcare establishments. We also evaluate different learning strategies on a set of biomedical tasks. In particular, we show that we can take advantage of already existing biomedical PLMs in a foreign language by further pre-train it on our targeted data. 
Finally, we release the first specialized PLMs for the biomedical field in French, called DrBERT, as well as the largest corpus of medical data under a free license on which these models are trained.",
We demonstrate the usability of ToxiCN and the effectiveness of TKE based on a systematic quantitative and qualitative analysis.", + "authors": [ + "Junyu Lu", + "Bo Xu", + "Xiaokun Zhang", + "Changrong Min", + "Liang Yang", + "Hongfei Lin" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.898", + "point2d": [ + 9.008051872253418, + 16.14124298095703 + ], + "cluster": 15.0 + }, + { + "idx": 900, + "title": "SpeechMatrix: A Large-Scale Mined Corpus of Multilingual Speech-to-Speech Translations", + "abstract": "We present SpeechMatrix, a large-scale multilingual corpus of speech-to-speech translations mined from real speech of European Parliament recordings. It contains speech alignments in 136 language pairs with a total of 418 thousand hours of speech. To evaluate the quality of this parallel speech, we train bilingual speech-to-speech translation models on mined data only and establish extensive baseline results on EuroParl-ST, VoxPopuli and FLEURS test sets. Enabled by the multilinguality of SpeechMatrix, we also explore multilingual speech-to-speech translation, a topic which was addressed by few other works. We also demonstrate that model pre-training and sparse scaling using Mixture-of-Experts bring large gains to translation performance. The mined data and models will be publicly released", + "authors": [ + "Paul-Ambroise Duquenne", + "Hongyu Gong", + "Ning Dong", + "Jingfei Du", + "Ann Lee", + "Vedanuj Goswami", + "Changhan Wang", + "Juan Pino", + "Beno\u00eet Sagot", + "Holger Schwenk" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.899", + "point2d": [ + -70.28660583496094, + 18.102649688720703 + ], + "cluster": 37.0 + }, + { + "idx": 901, + "title": "Character-Aware Models Improve Visual Text Rendering", + "abstract": "Current image generation models struggle to reliably produce well-formed visual text. In this paper, we investigate a key contributing factor: popular text-to-image models lack character-level input features, making it much harder to predict a word\u2019s visual makeup as a series of glyphs. To quantify this effect, we conduct a series of experiments comparing character-aware vs. character-blind text encoders. In the text-only domain, we find that character-aware models provide large gains on a novel spelling task (WikiSpell). Applying our learnings to the visual domain, we train a suite of image generation models, and show that character-aware variants outperform their character-blind counterparts across a range of novel text rendering tasks (our DrawText benchmark). 
Our models set a much higher state-of-the-art on visual spelling, with 30+ point accuracy gains over competitors on rare words, despite training on far fewer examples.", + "authors": [ + "Rosanne Liu", + "Dan Garrette", + "Chitwan Saharia", + "William Chan", + "Adam Roberts", + "Sharan Narang", + "Irina Blok", + "Rj Mical", + "Mohammad Norouzi", + "Noah Constant" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.900", + "point2d": [ + -63.58209228515625, + 50.473670959472656 + ], + "cluster": 43.0 + }, + { + "idx": 902, + "title": "IDRISI-RA: The First Arabic Location Mention Recognition Dataset of Disaster Tweets", + "abstract": "Extracting geolocation information from social media data enables effective disaster management, as it helps response authorities; for example, in locating incidents for planning rescue activities, and affected people for evacuation. Nevertheless, geolocation extraction is greatly understudied for the low resource languages such as Arabic. To fill this gap, we introduce IDRISI-RA, the first publicly-available Arabic Location Mention Recognition (LMR) dataset that provides human- and automatically-labeled versions in order of thousands and millions of tweets, respectively. It contains both location mentions and their types (e.g., district, city). Our extensive analysis shows the decent geographical, domain, location granularity, temporal, and dialectical coverage of IDRISI-RA. Furthermore, we establish baselines using the standard Arabic NER models and build two simple, yet effective, LMR models. Our rigorous experiments confirm the need for developing specific models for Arabic LMR in the disaster domain. Moreover, experiments show the promising domain and geographical generalizability of IDRISI-RA under zero-shot learning.", + "authors": [ + "Reem Suwaileh", + "Muhammad Imran", + "Tamer Elsayed" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.901", + "point2d": [ + 25.746204376220703, + -83.63877868652344 + ], + "cluster": 14.0 + }, + { + "idx": 903, + "title": "FSUIE: A Novel Fuzzy Span Mechanism for Universal Information Extraction", + "abstract": "Universal Information Extraction (UIE) has been introduced as a unified framework for various Information Extraction (IE) tasks and has achieved widespread success. Despite this, UIE models have limitations. For example, they rely heavily on span boundaries in the data during training, which does not reflect the reality of span annotation challenges. Slight adjustments to positions can also meet requirements. Additionally, UIE models lack attention to the limited span length feature in IE. To address these deficiencies, we propose the Fuzzy Span Universal Information Extraction (FSUIE) framework. Specifically, our contribution consists of two concepts: fuzzy span loss and fuzzy span attention. Our experimental results on a series of main IE tasks show significant improvement compared to the baseline, especially in terms of fast convergence and strong performance with small amounts of data and training epochs. 
These results demonstrate the effectiveness and generalizability of FSUIE across different tasks, settings, and scenarios.",
Empirical studies and extensive analysis with four benchmarks demonstrate the effectiveness of the proposed model.", + "authors": [ + "Weiyi Yang", + "Richong Zhang", + "Junfan Chen", + "Lihong Wang", + "Jaein Kim" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.904", + "point2d": [ + -1.773980736732483, + -24.202983856201172 + ], + "cluster": 17.0 + }, + { + "idx": 906, + "title": "LENS: A Learnable Evaluation Metric for Text Simplification", + "abstract": "Training learnable metrics using modern language models has recently emerged as a promising method for the automatic evaluation of machine translation. However, existing human evaluation datasets for text simplification have limited annotations that are based on unitary or outdated models, making them unsuitable for this approach. To address these issues, we introduce the SimpEval corpus that contains: SimpEval_past, comprising 12K human ratings on 2.4K simplifications of 24 past systems, and SimpEval_2022, a challenging simplification benchmark consisting of over 1K human ratings of 360 simplifications including GPT-3.5 generated text. Training on SimpEval, we present LENS, a Learnable Evaluation Metric for Text Simplification. Extensive empirical results show that LENS correlates much better with human judgment than existing metrics, paving the way for future progress in the evaluation of text simplification. We also introduce Rank & Rate, a human evaluation framework that rates simplifications from several models in a list-wise manner using an interactive interface, which ensures both consistency and accuracy in the evaluation process and is used to create the SimpEval datasets.", + "authors": [ + "Mounica Maddela", + "Yao Dou", + "David Heineman", + "Wei Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.905", + "point2d": [ + -32.49061965942383, + 28.84471893310547 + ], + "cluster": 1.0 + }, + { + "idx": 907, + "title": "MeetingBank: A Benchmark Dataset for Meeting Summarization", + "abstract": "As the number of recorded meetings increases, it becomes increasingly important to utilize summarization technology to create useful summaries of these recordings. However, there is a crucial lack of annotated meeting corpora for developing this technology, as it can be hard to collect meetings, especially when the topics discussed are confidential. Furthermore, meeting summaries written by experienced writers are scarce, making it hard for abstractive summarizers to produce sensible output without a reliable reference. This lack of annotated corpora has hindered the development of meeting summarization technology. In this paper, we present MeetingBank, a new benchmark dataset of city council meetings over the past decade. MeetingBank is unique among other meeting corpora due to its divide-and-conquer approach, which involves dividing professionally written meeting minutes into shorter passages and aligning them with specific segments of the meeting. This breaks down the process of summarizing a lengthy meeting into smaller, more manageable tasks. The dataset provides a new testbed of various meeting summarization systems and also allows the public to gain insight into how council decisions are made. 
We make the collection, including meeting video links, transcripts, reference summaries, agendas, and other metadata, publicly available to facilitate the development of better meeting summarization techniques.",
We show that training a transformer-based seq2seq text simplification model on DEplain achieves promising results. We make the corpus, the adapted alignment methods for German, the web harvester, and the trained models available at https://github.com/rstodden/DEPlain.",
Furthermore, the implementation of RARR requires only a handful of training examples, a large language model, and standard web search.", + "authors": [ + "Luyu Gao", + "Zhuyun Dai", + "Panupong Pasupat", + "Anthony Chen", + "Arun Tejasvi Chaganty", + "Yicheng Fan", + "Vincent Zhao", + "Ni Lao", + "Hongrae Lee", + "Da-Cheng Juan", + "Kelvin Guu" + ], + "year": 2023, + "source": "acl", + "publication_type": "long", + "doi": "10.18653/v1/2023.acl-long.910", + "point2d": [ + -10.257352828979492, + 14.635920524597168 + ], + "cluster": 4.0 + }, + { + "idx": 912, + "title": "Should you marginalize over possible tokenizations?", + "abstract": "Autoregressive language models (LMs) map token sequences to probabilities. The usual practice for computing the probability of any character string (e.g. English sentences) is to first transform it into a sequence of tokens that is scored by the model. However, there are exponentially many token sequences that represent any given string. To truly compute the probability of a string one should marginalize over all tokenizations, which is typically intractable. Here, we analyze whether the practice of ignoring the marginalization is justified. To this end, we devise an importance-sampling-based algorithm that allows us to compute estimates of the marginal probabilities and compare them to the default procedure in a range of state-of-the-art models and datasets. Our results show that the gap in log-likelihood is no larger than 0.5% in most cases, but that it becomes more pronounced for data with long complex words.", + "authors": [ + "Nadezhda Chirkova", + "Germ\u00e1n Kruszewski", + "Jos Rozen", + "Marc Dymetman" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.1", + "point2d": [ + -38.79295349121094, + -2.3781018257141113 + ], + "cluster": 27.0 + }, + { + "idx": 913, + "title": "Back to Patterns: Efficient Japanese Morphological Analysis with Feature-Sequence Trie", + "abstract": "Accurate neural models are much less efficient than non-neural models and are useless for processing billions of social media posts or handling user queries in real time with a limited budget. This study revisits the fastest pattern-based NLP methods to make them as accurate as possible, thus yielding a strikingly simple yet surprisingly accurate morphological analyzer for Japanese. The proposed method induces reliable patterns from a morphological dictionary and annotated data. Experimental results on two standard datasets confirm that the method exhibits comparable accuracy to learning-based baselines, while boasting a remarkable throughput of over 1,000,000 sentences per second on a single modern CPU. The source code is available at https://www.tkl.iis.u-tokyo.ac.jp/ ynaga/jagger/", + "authors": [ + "Naoki Yoshinaga" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.2", + "point2d": [ + -40.52116012573242, + -40.88238525390625 + ], + "cluster": 46.0 + }, + { + "idx": 914, + "title": "Transformed Protoform Reconstruction", + "abstract": "Protoform reconstruction is the task of inferring what morphemes or words appeared like in the ancestral languages of a set of daughter languages. Meloni et al (2021) achieved the state-of-the-art on Latin protoform reconstruction with an RNN-based encoder-decoder with attention model. We update their model with the state-of-the-art seq2seq model: the Transformer. 
Our model outperforms their model on a suite of different metrics on two different datasets: their Romance data of 8,000 cognates spanning 5 languages and a Chinese dataset (Hou 2004) of 800+ cognates spanning 39 varieties. We also probe our model for potential phylogenetic signal contained in the model. Our code is publicly available at https://github.com/cmu-llab/acl-2023.", + "authors": [ + "Young Min Kim", + "Kalvin Chang", + "Chenxuan Cui", + "David R. Mortensen" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.3", + "point2d": [ + -40.0909309387207, + -46.595394134521484 + ], + "cluster": 46.0 + }, + { + "idx": 915, + "title": "Ellipsis-Dependent Reasoning: a New Challenge for Large Language Models", + "abstract": "We propose a novel challenge for large language models: ellipsis-dependent reasoning. We define several structures of paired examples, where an ellipsis example is matched to its non-ellipsis counterpart, and a question is posed which requires resolution of the ellipsis. Test results show that the best models perform well on non-elliptical examples but struggle with all but the simplest ellipsis structures.", + "authors": [ + "Daniel Hardt" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.4", + "point2d": [ + -14.510537147521973, + -71.95985412597656 + ], + "cluster": 36.0 + }, + { + "idx": 916, + "title": "Bootstrapping Neural Relation and Explanation Classifiers", + "abstract": "We introduce a method that self trains (or bootstraps) neural relation and explanation classifiers. Our work expands the supervised approach of CITATION, which jointly trains a relation classifier with an explanation classifier that identifies context words important for the relation at hand, to semi-supervised scenarios. In particular, our approach iteratively converts the explainable models\u2019 outputs to rules and applies them to unlabeled text to produce new annotations. Our evaluation on the TACRED dataset shows that our method outperforms the rule-based model we started from by 15 F1 points, outperforms traditional self-training that relies just on the relation classifier by 5 F1 points, and performs comparatively with the prompt-based approach of CITATION (without requiring an additional natural language inference component).", + "authors": [ + "Zheng Tang", + "Mihai Surdeanu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.5", + "point2d": [ + 39.26613998413086, + -59.1003532409668 + ], + "cluster": 38.0 + }, + { + "idx": 917, + "title": "A Fast Algorithm for Computing Prefix Probabilities", + "abstract": "Multiple algorithms are known for efficiently calculating the prefix probability of a string under a probabilistic context-free grammar (PCFG). Good algorithms for the problem have a runtime cubic in the length of the input string. However, some proposed algorithms are suboptimal with respect to the size of the grammar. This paper proposes a new speed-up of Jelinek and Lafferty\u2019s (1991) algorithm, which runs in O(n^3|N|^3 + |N|^4), where n is the input length and |N| is the number of non-terminals in the grammar.
In contrast, our speed-up runs in O(n^2|N|^3 + n^3|N|^2).", + "authors": [ + "Franz Nowak", + "Ryan Cotterell" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.6", + "point2d": [ + -22.653051376342773, + -58.33776092529297 + ], + "cluster": 41.0 + }, + { + "idx": 918, + "title": "Analyzing Text Representations by Measuring Task Alignment", + "abstract": "Textual representations based on pre-trained language models are key, especially in few-shot learning scenarios. What makes a representation good for text classification? Is it due to the geometric properties of the space or because it is well aligned with the task? We hypothesize the second claim. To test it, we develop a task alignment score based on hierarchical clustering that measures alignment at different levels of granularity. Our experiments on text classification validate our hypothesis by showing that task alignment can explain the classification performance of a given representation.", + "authors": [ + "Cesar Gonzalez-Gutierrez", + "Audi Primadhanty", + "Francesco Cazzaro", + "Ariadna Quattoni" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.7", + "point2d": [ + -1.579097032546997, + -29.12262535095215 + ], + "cluster": 17.0 + }, + { + "idx": 919, + "title": "Tracing Linguistic Markers of Influence in a Large Online Organisation", + "abstract": "Social science and psycholinguistic research have shown that power and status affect how people use language in a range of domains. Here, we investigate a similar question in a large, distributed, consensus-driven community with little traditional power hierarchy \u2013 the Internet Engineering Task Force (IETF), a collaborative organisation that designs internet standards. Our analysis, based on lexical categories (LIWC) and BERT, shows that participants\u2019 levels of influence can be predicted from their email text, and identifies key linguistic differences (e.g., certain LIWC categories, such as \u201cWE\u201d are positively correlated with high-influence). We also identify the differences in language use for the same person before and after becoming influential.", + "authors": [ + "Prashant Khare", + "Ravi Shekhar", + "Mladen Karan", + "Stephen McQuistin", + "Colin Perkins", + "Ignacio Castro", + "Gareth Tyson", + "Patrick Healey", + "Matthew Purver" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.8", + "point2d": [ + 31.854488372802734, + 31.546796798706055 + ], + "cluster": 10.0 + }, + { + "idx": 920, + "title": "Metaphor Detection via Explicit Basic Meanings Modelling", + "abstract": "One noticeable trend in metaphor detection is the embrace of linguistic theories such as the metaphor identification procedure (MIP) for model architecture design. While MIP clearly defines that the metaphoricity of a lexical unit is determined based on the contrast between its contextual meaning and its basic meaning, existing work does not strictly follow this principle, typically using the aggregated meaning to approximate the basic meaning of target words. In this paper, we propose a novel metaphor detection method, which models the basic meaning of the word based on literal annotation from the training set, and then compares this with the contextual meaning in a target sentence to identify metaphors. Empirical results show that our method outperforms the state-of-the-art method significantly by 1.0% in F1 score.
Moreover, our performance even reaches the theoretical upper bound on the VUA18 benchmark for targets with basic annotations, which demonstrates the importance of modelling basic meanings for metaphor detection.", + "authors": [ + "Yucheng Li", + "Shun Wang", + "Chenghua Lin", + "Frank Guerin" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.9", + "point2d": [ + 7.531775951385498, + -56.50359344482422 + ], + "cluster": 9.0 + }, + { + "idx": 921, + "title": "xSIM++: An Improved Proxy to Bitext Mining Performance for Low-Resource Languages", + "abstract": "We introduce a new proxy score for evaluating bitext mining based on similarity in a multilingual embedding space: xsim++. In comparison to xsim, this improved proxy leverages rule-based approaches to extend English sentences in any evaluation set with synthetic, hard-to-distinguish examples which more closely mirror the scenarios we encounter during large-scale mining. We validate this proxy by running a significant number of bitext mining experiments for a set of low-resource languages, and subsequently train NMT systems on the mined data. In comparison to xsim, we show that xsim++ is better correlated with the downstream BLEU scores of translation systems trained on mined bitexts, providing a reliable proxy of bitext mining performance without needing to run expensive bitext mining pipelines. xsim++ also reports performance for different error types, offering more fine-grained feedback for model development.", + "authors": [ + "Mingda Chen", + "Kevin Heffernan", + "Onur \u00c7elebi", + "Alexandre Mourachko", + "Holger Schwenk" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.10", + "point2d": [ + -54.904510498046875, + -11.2777738571167 + ], + "cluster": 1.0 + }, + { + "idx": 922, + "title": "Graph Propagation based Data Augmentation for Named Entity Recognition", + "abstract": "Data augmentation is an effective solution to improve model performance and robustness for low-resource named entity recognition (NER). However, synthetic data often suffer from poor diversity, which leads to performance limitations. In this paper, we propose a novel Graph Propagated Data Augmentation (GPDA) framework for Named Entity Recognition (NER), leveraging graph propagation to build relationships between labeled data and unlabeled natural texts. By projecting the annotations from the labeled text to the unlabeled text, the unlabeled texts are partially labeled, which has more diversity than synthetic annotated data. To strengthen the propagation precision, a simple search engine built on Wikipedia is utilized to fetch related texts of labeled data and to propagate the entity labels to them in the light of the anchor links.
Besides, we construct and perform experiments on a real-world low-resource dataset of the E-commerce domain, which will be publicly available to facilitate low-resource NER research. Experimental results show that GPDA presents substantial improvements over previous data augmentation methods on multiple low-resource NER datasets.", + "authors": [ + "Jiong Cai", + "Shen Huang", + "Yong Jiang", + "Zeqi Tan", + "Pengjun Xie", + "Kewei Tu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.11", + "point2d": [ + 33.48052215576172, + -82.88944244384766 + ], + "cluster": 14.0 + }, + { + "idx": 923, + "title": "Dataset Distillation with Attention Labels for Fine-tuning BERT", + "abstract": "Dataset distillation aims to create a small dataset of informative synthetic samples to rapidly train neural networks that retain the performance of the original dataset. In this paper, we focus on constructing distilled few-shot datasets for natural language processing (NLP) tasks to fine-tune pre-trained transformers. Specifically, we propose to introduce attention labels, which can efficiently distill the knowledge from the original dataset and transfer it to the transformer models via attention probabilities. We evaluated our dataset distillation methods in four various NLP tasks and demonstrated that it is possible to create distilled few-shot datasets with the attention labels, yielding impressive performances for fine-tuning BERT. Specifically, in AGNews, a four-class news classification task, our distilled few-shot dataset achieved up to 93.2% accuracy, which is 98.5% of the performance of the original dataset even with only one sample per class and only one gradient step.", + "authors": [ + "Aru Maekawa", + "Naoki Kobayashi", + "Kotaro Funakoshi", + "Manabu Okumura" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.12", + "point2d": [ + -17.026451110839844, + -4.3772969245910645 + ], + "cluster": 39.0 + }, + { + "idx": 924, + "title": "Multi-Document Summarization with Centroid-Based Pretraining", + "abstract": "In Multi-Document Summarization (MDS), the input can be modeled as a set of documents, and the output is its summary. In this paper, we focus on pretraining objectives for MDS. Specifically, we introduce a novel pretraining objective, which involves selecting the ROUGE-based centroid of each document cluster as a proxy for its summary. Our objective thus does not require human written summaries and can be utilized for pretraining on a dataset consisting solely of document sets. Through zero-shot, few-shot, and fully supervised experiments on multiple MDS datasets, we show that our model Centrum is better or comparable to a state-of-the-art model.
We make the pretrained and fine-tuned models freely available to the research community: https://github.com/ratishsp/centrum.", + "authors": [ + "Ratish Surendran Puduppully", + "Parag Jain", + "Nancy Chen", + "Mark Steedman" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.13", + "point2d": [ + -9.52676773071289, + 41.488563537597656 + ], + "cluster": 7.0 + }, + { + "idx": 925, + "title": "Scaling in Cognitive Modelling: a Multilingual Approach to Human Reading Times", + "abstract": "Neural language models are increasingly valued in computational psycholinguistics, due to their ability to provide conditional probability distributions over the lexicon that are predictive of human processing times. Given the vast array of available models, it is of both theoretical and methodological importance to assess what features of a model influence its psychometric quality. In this work we focus on parameter size, showing that larger Transformer-based language models generate probabilistic estimates that are less predictive of early eye-tracking measurements reflecting lexical access and early semantic integration. However, relatively bigger models show an advantage in capturing late eye-tracking measurements that reflect the full semantic and syntactic integration of a word into the current language context. Our results are supported by eye movement data in ten languages and consider four models, spanning from 564M to 4.5B parameters.", + "authors": [ + "Andrea de Varda", + "Marco Marelli" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.14", + "point2d": [ + -44.45745086669922, + -9.45345401763916 + ], + "cluster": 46.0 + }, + { + "idx": 926, + "title": "Improving Generalization in Language Model-based Text-to-SQL Semantic Parsing: Two Simple Semantic Boundary-based Techniques", + "abstract": "Compositional and domain generalization present significant challenges in semantic parsing, even for state-of-the-art semantic parsers based on pre-trained language models (LMs). In this study, we empirically investigate improving an LM\u2019s generalization in semantic parsing with two simple techniques: at the token level, we introduce a token preprocessing method to preserve the semantic boundaries of tokens produced by LM tokenizers; at the sequence level, we propose to use special tokens to mark the boundaries of components aligned between input and output. Our experimental results on two text-to-SQL semantic parsing datasets show that our token preprocessing, although simple, can substantially improve the LM performance on both types of generalization, and our component boundary marking method is particularly helpful for compositional generalization.", + "authors": [ + "Daking Rai", + "Bailin Wang", + "Yilun Zhou", + "Ziyu Yao" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.15", + "point2d": [ + -29.319318771362305, + -58.401668548583984 + ], + "cluster": 41.0 + }, + { + "idx": 927, + "title": "HiPool: Modeling Long Documents Using Graph Neural Networks", + "abstract": "Encoding long sequences in Natural Language Processing (NLP) is a challenging problem. Though recent pretraining language models achieve satisfying performances in many NLP tasks, they are still restricted by a pre-defined maximum length, making them challenging to be extended to longer sequences. So some recent works utilize hierarchies to model long sequences.
However, most of them apply sequential models for upper hierarchies, suffering from long dependency issues. In this paper, we alleviate these issues through a graph-based method. We first chunk the sequence with a fixed length to model the sentence-level information. We then leverage graphs to model intra- and cross-sentence correlations with a new attention mechanism. Additionally, due to limited standard benchmarks for long document classification (LDC), we propose a new challenging benchmark, totaling six datasets with up to 53k samples and 4034 average tokens\u2019 length. Evaluation shows our model surpasses competitive baselines by 2.6% in F1 score, and 4.8% on the longest sequence dataset. Our method is shown to outperform hierarchical sequential models with better performance and scalability, especially for longer sequences.", + "authors": [ + "Irene Li", + "Aosong Feng", + "Dragomir Radev", + "Rex Ying" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.16", + "point2d": [ + -6.605660438537598, + -33.91691970825195 + ], + "cluster": 27.0 + }, + { + "idx": 928, + "title": "A Weakly Supervised Classifier and Dataset of White Supremacist Language", + "abstract": "We present a dataset and classifier for detecting the language of white supremacist extremism, a growing issue in online hate speech. Our weakly supervised classifier is trained on large datasets of text from explicitly white supremacist domains paired with neutral and anti-racist data from similar domains. We demonstrate that this approach improves generalization performance to new domains. Incorporating anti-racist texts as counterexamples to white supremacist language mitigates bias.", + "authors": [ + "Michael Yoder", + "Ahmad Diab", + "David Brown", + "Kathleen Carley" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.17", + "point2d": [ + 29.85991859436035, + 23.687585830688477 + ], + "cluster": 15.0 + }, + { + "idx": 929, + "title": "BOLT: Fast Energy-based Controlled Text Generation with Tunable Biases", + "abstract": "Energy-based models (EBMs) have gained popularity for controlled text generation due to their high applicability to a wide range of constraints. However, sampling from EBMs is non-trivial, as it often requires a large number of iterations to converge to plausible text, which slows down the decoding process and makes it less practical for real-world applications. In this work, we propose BOLT, which relies on tunable biases to directly adjust the language model\u2019s output logits. Unlike prior work, BOLT maintains the generator\u2019s autoregressive nature to assert a strong control on token-wise conditional dependencies and overall fluency, and thus converges faster. When compared with state-of-the-arts on controlled generation tasks using both soft constraints (e.g., sentiment control) and hard constraints (e.g., keyword-guided topic control), BOLT demonstrates significantly improved efficiency and fluency. 
On sentiment control, BOLT is 7x faster than competitive baselines, and more fluent in 74.4% of the evaluation samples according to human judges.", + "authors": [ + "Xin Liu", + "Muhammad Khalifa", + "Lu Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.18", + "point2d": [ + -28.28877067565918, + 9.631608009338379 + ], + "cluster": 4.0 + }, + { + "idx": 930, + "title": "mOKB6: A Multilingual Open Knowledge Base Completion Benchmark", + "abstract": "Automated completion of open knowledge bases (Open KBs), which are constructed from triples of the form (subject phrase, relation phrase, object phrase), obtained via an open information extraction (Open IE) system, is useful for discovering novel facts that may not be directly present in the text. However, research in Open KB completion (Open KBC) has so far been limited to resource-rich languages like English. Using the latest advances in multilingual Open IE, we construct the first multilingual Open KBC dataset, called mOKB6, containing facts from Wikipedia in six languages (including English). Improving the previous Open KB construction pipeline by doing multilingual coreference resolution and keeping only entity-linked triples, we create a dense Open KB. We experiment with several models for the task and observe a consistent benefit of combining languages with the help of shared embedding space as well as translations of facts. We also observe that current multilingual models struggle to remember facts seen in languages of different scripts.", + "authors": [ + "Shubham Mittal", + "Keshav Kolluru", + "Soumen Chakrabarti", + "Mausam -" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.19", + "point2d": [ + 47.29073715209961, + -63.97040557861328 + ], + "cluster": 25.0 + }, + { + "idx": 931, + "title": "Covering Uncommon Ground: Gap-Focused Question Generation for Answer Assessment", + "abstract": "Human communication often involves information gaps between the interlocutors. For example, in an educational dialogue a student often provides an answer that is incomplete, and there is a gap between this answer and the perfect one expected by the teacher. Successful dialogue then hinges on the teacher asking about this gap in an effective manner, thus creating a rich and interactive educational experience. We focus on the problem of generating such gap-focused questions (GFQs) automatically. We define the task, highlight key desired aspects of a good GFQ, and propose a model that satisfies these. Finally, we provide an evaluation by human annotators of our generated questions compared against human generated ones, demonstrating competitive performance.", + "authors": [ + "Roni Rabin", + "Alexandre Djerbetian", + "Roee Engelberg", + "Lidan Hackmon", + "Gal Elidan", + "Reut Tsarfaty", + "Amir Globerson" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.20", + "point2d": [ + 69.74972534179688, + 11.600690841674805 + ], + "cluster": 5.0 + }, + { + "idx": 932, + "title": "Detoxifying Text with MaRCo: Controllable Revision with Experts and Anti-Experts", + "abstract": "Text detoxification has the potential to mitigate the harms of toxicity by rephrasing text to remove offensive meaning, but subtle toxicity remains challenging to tackle.
We introduce MaRCo, a detoxification algorithm that combines controllable generation and text rewriting methods using a Product of Experts with autoencoder language models (LMs). MaRCo uses likelihoods under a non-toxic LM (expert) and a toxic LM (anti-expert) to find candidate words to mask and potentially replace. We evaluate our method on several subtle toxicity and microaggressions datasets, and show that it not only outperforms baselines on automatic metrics, but MaRCo\u2019s rewrites are preferred 2.1 times more in human evaluation. Its applicability to instances of subtle toxicity is especially promising, demonstrating a path forward for addressing increasingly elusive online hate.", + "authors": [ + "Skyler Hallinan", + "Alisa Liu", + "Yejin Choi", + "Maarten Sap" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.21", + "point2d": [ + 7.474602699279785, + 15.845406532287598 + ], + "cluster": 15.0 + }, + { + "idx": 933, + "title": "A Natural Bias for Language Generation Models", + "abstract": "After just a few hundred training updates, a standard probabilistic model for language generation has likely not yet learnt many semantic or syntactic rules of natural language, making it difficult to estimate the probability distribution over next tokens. Yet around this point, these models have identified a simple, loss-minimising behaviour: to output the unigram distribution of the target training corpus. The use of such a heuristic raises the question: Can we initialise our models with this behaviour and save precious compute resources and model capacity? Here we show that we can effectively endow standard neural language generation models with a separate module that reflects unigram frequency statistics as prior knowledge, simply by initialising the bias term in a model\u2019s final linear layer with the log-unigram distribution. We use neural machine translation as a test bed for this simple technique and observe that it: (i) improves learning efficiency; (ii) achieves better overall performance; and perhaps most importantly (iii) appears to disentangle strong frequency effects by encouraging the model to specialise in non-frequency-related aspects of language.", + "authors": [ + "Clara Meister", + "Wojciech Stokowiec", + "Tiago Pimentel", + "Lei Yu", + "Laura Rimell", + "Adhiguna Kuncoro" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.22", + "point2d": [ + -24.014009475708008, + 11.665936470031738 + ], + "cluster": 4.0 + }, + { + "idx": 934, + "title": "Simple Augmentations of Logical Rules for Neuro-Symbolic Knowledge Graph Completion", + "abstract": "High-quality and high-coverage rule sets are imperative to the success of Neuro-Symbolic Knowledge Graph Completion (NS-KGC) models, because they form the basis of all symbolic inferences. Recent literature builds neural models for generating rule sets, however, preliminary experiments show that they struggle with maintaining high coverage. In this work, we suggest three simple augmentations to existing rule sets: (1) transforming rules to their abductive forms, (2) generating equivalent rules that use inverse forms of constituent relations and (3) random walks that propose new rules. Finally, we prune potentially low quality rules. 
Experiments over four datasets and five ruleset-baseline settings suggest that these simple augmentations consistently improve results, and obtain up to 7.1 pt MRR and 8.5 pt Hits@1 gains over using rules without augmentations.", + "authors": [ + "Ananjan Nandi", + "Navdeep Kaur", + "Parag Singla", + "Mausam -" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.23", + "point2d": [ + 55.83039474487305, + -60.110660552978516 + ], + "cluster": 45.0 + }, + { + "idx": 935, + "title": "Parameter-efficient Weight Ensembling Facilitates Task-level Knowledge Transfer", + "abstract": "Recent studies show that large-scale pre-trained language models could be efficaciously adapted to particular tasks in a parameter-efficient manner. The trained lightweight set of parameters, such as adapters, can be easily stored and shared as a capability equipped with the corresponding models. Owning many lightweight parameters, we focus on transferring them between tasks to acquire an improvement in performance of new tasks, the key point of which is to obtain the similarity between tasks. In this paper, we explore 5 parameter-efficient weight ensembling methods to achieve such transferability and verify the effectiveness of them. These methods extract the information of datasets and trained lightweight parameters from different perspectives to obtain the similarity between tasks, and weight the existing lightweight parameters according to the comparability to acquire a suitable module for the initialization of new tasks. We apply them to three parameter-efficient tuning methods and test them on a wide set of downstream tasks. Experimental results show that our methods show an improvement of 5%~8% over baselines and could largely facilitate task-level knowledge transfer.", + "authors": [ + "Xingtai Lv", + "Ning Ding", + "Yujia Qin", + "Zhiyuan Liu", + "Maosong Sun" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.24", + "point2d": [ + -36.43976974487305, + -15.06789493560791 + ], + "cluster": 8.0 + }, + { + "idx": 936, + "title": "Faithfulness Tests for Natural Language Explanations", + "abstract": "Explanations of neural models aim to reveal a model\u2019s decision-making process for its predictions. However, recent work shows that current methods giving explanations such as saliency maps or counterfactuals can be misleading, as they are prone to present reasons that are unfaithful to the model\u2019s inner workings. This work explores the challenging question of evaluating the faithfulness of natural language explanations (NLEs). To this end, we present two tests. First, we propose a counterfactual input editor for inserting reasons that lead to counterfactual predictions but are not reflected by the NLEs. Second, we reconstruct inputs from the reasons stated in the generated NLEs and check how often they lead to the same predictions. 
Our tests can evaluate emerging NLE models, proving a fundamental tool in the development of faithful NLEs.", + "authors": [ + "Pepa Atanasova", + "Oana-Maria Camburu", + "Christina Lioma", + "Thomas Lukasiewicz", + "Jakob Grue Simonsen", + "Isabelle Augenstein" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.25", + "point2d": [ + 27.612253189086914, + -9.097256660461426 + ], + "cluster": 31.0 + }, + { + "idx": 937, + "title": "COGEN: Abductive Commonsense Language Generation", + "abstract": "Reasoning is one of the most important elements in achieving Artificial General Intelligence (AGI), specifically when it comes to Abductive and counterfactual reasoning. In order to introduce these capabilities of reasoning in Natural Language Processing (NLP) models, there have been recent advances towards training NLP models to better perform on two main tasks - Abductive Natural Language Inference (alphaNLI) and Abductive Natural Language Generation Task (alphaNLG). This paper proposes CoGen, a model for both alphaNLI and alphaNLG tasks that employ a novel approach of combining the temporal commonsense reasoning for each observation (before and after a real hypothesis) from pre-trained models with contextual filtering for training. Additionally, we use state-of-the-art semantic entailment to filter out the contradictory hypothesis during the inference. Our experimental results show that CoGen outperforms current models and set a new state of the art in regards to alphaNLI and alphaNLG tasks. We make the source code of CoGen model publicly available for reproducibility and to facilitate relevant future research.", + "authors": [ + "Rohola Zandie", + "Diwanshu Shekhar", + "Mohammad Mahoor" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.26", + "point2d": [ + 56.00025939941406, + -17.596071243286133 + ], + "cluster": 31.0 + }, + { + "idx": 938, + "title": "Multimodal Relation Extraction with Cross-Modal Retrieval and Synthesis", + "abstract": "Multimodal relation extraction (MRE) is the task of identifying the semantic relationships between two entities based on the context of the sentence image pair. Existing retrieval-augmented approaches mainly focused on modeling the retrieved textual knowledge, but this may not be able to accurately identify complex relations. To improve the prediction, this research proposes to retrieve textual and visual evidence based on the object, sentence, and whole image. We further develop a novel approach to synthesize the object-level, image-level, and sentence-level information for better reasoning between the same and different modalities. Extensive experiments and analyses show that the proposed method is able to effectively select and compare evidence across modalities and significantly outperforms state-of-the-art models.", + "authors": [ + "Xuming Hu", + "Zhijiang Guo", + "Zhiyang Teng", + "Irwin King", + "Philip S. Yu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.27", + "point2d": [ + -48.63084411621094, + 41.02821731567383 + ], + "cluster": 13.0 + }, + { + "idx": 939, + "title": "Characterization of Stigmatizing Language in Medical Records", + "abstract": "Widespread disparities in clinical outcomes exist between different demographic groups in the United States. 
A new line of work in medical sociology has demonstrated physicians often use stigmatizing language in electronic medical records within certain groups, such as black patients, which may exacerbate disparities. In this study, we characterize these instances at scale using a series of domain-informed NLP techniques. We highlight important differences between this task and analogous bias-related tasks studied within the NLP community (e.g., classifying microaggressions). Our study establishes a foundation for NLP researchers to contribute timely insights to a problem domain brought to the forefront by recent legislation regarding clinical documentation transparency. We release data, code, and models.", + "authors": [ + "Keith Harrigian", + "Ayah Zirikly", + "Brant Chee", + "Alya Ahmad", + "Anne Links", + "Somnath Saha", + "Mary Catherine Beach", + "Mark Dredze" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.28", + "point2d": [ + 29.98129653930664, + 14.21888542175293 + ], + "cluster": 10.0 + }, + { + "idx": 940, + "title": "Abstractive Summarizers are Excellent Extractive Summarizers", + "abstract": "Extractive and abstractive summarization designs have historically been fragmented, limiting the benefits that often arise from compatible model architectures. In this paper, we explore the potential synergies of modeling extractive summarization with an abstractive summarization system and propose three novel inference algorithms using the sequence-to-sequence architecture. We evaluate them on the CNN & Dailymail dataset and show that recent advancements in abstractive system designs enable abstractive systems to not only compete, but even surpass the performance of extractive systems with custom architectures. To our surprise, abstractive systems achieve this without being exposed to extractive oracle summaries and, therefore, for the first time allow a single model to produce both abstractive and extractive summaries. This evidence questions our fundamental understanding of extractive system design, and the necessity for extractive labels while paving the way for promising research directions in hybrid models.", + "authors": [ + "Daniel Varab", + "Yumo Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.29", + "point2d": [ + -6.568226337432861, + 43.31500244140625 + ], + "cluster": 7.0 + }, + { + "idx": 941, + "title": "Language Models Get a Gender Makeover: Mitigating Gender Bias with Few-Shot Data Interventions", + "abstract": "Societal biases present in pre-trained large language models are a critical issue as these models have been shown to propagate biases in countless downstream applications, rendering them unfair towards specific groups of people. Since large-scale retraining of these models from scratch is both time and compute-expensive, a variety of approaches have been previously proposed that de-bias a pre-trained model. While the majority of current state-of-the-art debiasing methods focus on changes to the training regime, in this paper, we propose data intervention strategies as a powerful yet simple technique to reduce gender bias in pre-trained models. Specifically, we empirically show that by fine-tuning a pre-trained model on only 10 debiased (intervened) training examples, the tendency to favor any gender is significantly reduced.
Since our proposed method only needs a few training examples, we argue that our few-shot de-biasing approach is highly feasible and practical. Through extensive experimentation, we show that our de-biasing technique performs better than competitive state-of-the-art baselines with minimal loss in language modeling ability.", + "authors": [ + "Himanshu Thakur", + "Atishay Jain", + "Praneetha Vaddamanu", + "Paul Pu Liang", + "Louis-Philippe Morency" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.30", + "point2d": [ + 12.679422378540039, + 29.021509170532227 + ], + "cluster": 10.0 + }, + { + "idx": 942, + "title": "PLUE: Language Understanding Evaluation Benchmark for Privacy Policies in English", + "abstract": "Privacy policies provide individuals with information about their rights and how their personal information is handled. Natural language understanding (NLU) technologies can support individuals and practitioners to understand better privacy practices described in lengthy and complex documents. However, existing efforts that use NLU technologies are limited by processing the language in a way exclusive to a single task focusing on certain privacy practices. To this end, we introduce the Privacy Policy Language Understanding Evaluation (PLUE) benchmark, a multi-task benchmark for evaluating the privacy policy language understanding across various tasks. We also collect a large corpus of privacy policies to enable privacy policy domain-specific language model pre-training. We evaluate several generic pre-trained language models and continue pre-training them on the collected corpus. We demonstrate that domain-specific continual pre-training offers performance improvements across all tasks. The code and models are released at https://github.com/JFChi/PLUE.", + "authors": [ + "Jianfeng Chi", + "Wasi Uddin Ahmad", + "Yuan Tian", + "Kai-Wei Chang" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.31", + "point2d": [ + -2.5698795318603516, + 17.011486053466797 + ], + "cluster": 15.0 + }, + { + "idx": 943, + "title": "Stop Pre-Training: Adapt Visual-Language Models to Unseen Languages", + "abstract": "Vision-Language Pre-training (VLP) has advanced the performance of many vision-language tasks, such as image-text retrieval, visual entailment, and visual reasoning.The pre-training mostly utilizes lexical databases and image queries in English. Previous work has demonstrated that the pre-training in English does not transfer well to other languages in a zero-shot setting. However, multilingual pre-trained language models (MPLM) have excelled at a variety of single-modal language tasks. In this paper, we propose a simple yet efficient approach to adapt VLP to unseen languages using MPLM.We utilize a cross-lingual contextualised token embeddings alignment approach to train text encoders for non-English languages. Our approach does not require image input and primarily uses machine translation, eliminating the need for target language data. Our evaluation across three distinct tasks (image-text retrieval, visual entailment, and natural language visual reasoning) demonstrates that this approach outperforms the state-of-the-art multilingual vision-language models without requiring large parallel corpora. 
Our code is available at https://github.com/Yasminekaroui/CliCoTea.", + "authors": [ + "Yasmine Karoui", + "R\u00e9mi Lebret", + "Negar Foroutan Eghlidi", + "Karl Aberer" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.32", + "point2d": [ + -55.45515060424805, + 35.92132568359375 + ], + "cluster": 26.0 + }, + { + "idx": 944, + "title": "BUCA: A Binary Classification Approach to Unsupervised Commonsense Question Answering", + "abstract": "Unsupervised commonsense reasoning (UCR) is becoming increasingly popular as the construction of commonsense reasoning datasets is expensive, and they are inevitably limited in their scope. A popular approach to UCR is to fine-tune language models with external knowledge (e.g., knowledge graphs), but this usually requires a large number of training examples. In this paper, we propose to transform the downstream multiple choice question answering task into a simpler binary classification task by ranking all candidate answers according to their reasonableness. To this end, for training the model, we convert the knowledge graph triples into reasonable and unreasonable texts. Extensive experimental results show the effectiveness of our approach on various multiple choice question answering benchmarks. Furthermore, compared with existing UCR approaches using KGs, ours is less data hungry.", + "authors": [ + "Jie He", + "Simon U", + "Victor Gutierrez-Basulto", + "Jeff Pan" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.33", + "point2d": [ + 69.21261596679688, + -0.8983586430549622 + ], + "cluster": 31.0 + }, + { + "idx": 945, + "title": "Nichelle and Nancy: The Influence of Demographic Attributes and Tokenization Length on First Name Biases", + "abstract": "Through the use of first name substitution experiments, prior research has demonstrated the tendency of social commonsense reasoning models to systematically exhibit social biases along the dimensions of race, ethnicity, and gender (An et al., 2023). Demographic attributes of first names, however, are strongly correlated with corpus frequency and tokenization length, which may influence model behavior independent of or in addition to demographic factors. In this paper, we conduct a new series of first name substitution experiments that measures the influence of these factors while controlling for the others. We find that demographic attributes of a name (race, ethnicity, and gender) and name tokenization length are both factors that systematically affect the behavior of social commonsense reasoning models.", + "authors": [ + "Haozhe An", + "Rachel Rudinger" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.34", + "point2d": [ + 22.958539962768555, + 30.62846565246582 + ], + "cluster": 10.0 + }, + { + "idx": 946, + "title": "Improving Syntactic Probing Correctness and Robustness with Control Tasks", + "abstract": "Syntactic probing methods have been used to examine whether and how pre-trained language models (PLMs) encode syntactic features. However, the probing methods are usually biased by the PLMs\u2019 memorization of common word co-occurrences, even if they do not form syntactic relations. This paper presents a random-word-substitution and random-label-matching control task to reduce these biases and improve the robustness of syntactic probing methods. 
Our control tasks are also shown to notably improve the consistency of probing results between different probing methods and make the methods more robust with respect to the text attributes of the probing instances. Our control tasks make syntactic probing methods better at reconstructing syntactic features and more generalizable to unseen text domains. Our experiments show that our proposed control tasks are effective on different PLMs, probing methods, and syntactic features.", + "authors": [ + "Weicheng Ma", + "Brian Wang", + "Hefan Zhang", + "Lili Wang", + "Rolando Coto-Solano", + "Saeed Hassanpour", + "Soroush Vosoughi" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.35", + "point2d": [ + -21.530048370361328, + -64.5465087890625 + ], + "cluster": 41.0 + }, + { + "idx": 947, + "title": "Split-NER: Named Entity Recognition via Two Question-Answering-based Classifications", + "abstract": "In this work, we address the NER problem by splitting it into two logical sub-tasks: (1) Span Detection which simply extracts entity mention spans irrespective of entity type; (2) Span Classification which classifies the spans into their entity types. Further, we formulate both sub-tasks as question-answering (QA) problems and produce two leaner models which can be optimized separately for each sub-task. Experiments with four cross-domain datasets demonstrate that this two-step approach is both effective and time efficient. Our system, SplitNER outperforms baselines on OntoNotes5.0, WNUT17 and a cybersecurity dataset and gives on-par performance on BioNLP13CG. In all cases, it achieves a significant reduction in training time compared to its QA baseline counterpart. The effectiveness of our system stems from fine-tuning the BERT model twice, separately for span detection and classification. The source code can be found at https://github.com/c3sr/split-ner.", + "authors": [ + "Jatin Arora", + "Youngja Park" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.36", + "point2d": [ + 30.719459533691406, + -80.71558380126953 + ], + "cluster": 14.0 + }, + { + "idx": 948, + "title": "Credible without Credit: Domain Experts Assess Generative Language Models", + "abstract": "Language models have recently broken into the public consciousness with the release of the wildly popular ChatGPT. Commentators have argued that language models could replace search engines, make college essays obsolete, or even write academic research papers. All of these tasks rely on accuracy of specialized information which can be difficult to assess for non-experts. Using 10 domain experts across science and culture, we provide an initial assessment of the coherence, conciseness, accuracy, and sourcing of two language models across 100 expert-written questions. While we find the results are consistently cohesive and concise, we find that they are mixed in their accuracy. 
These results raise questions of the role language models should play in general-purpose and expert knowledge seeking.", + "authors": [ + "Denis Peskoff", + "Brandon Stewart" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.37", + "point2d": [ + 22.630775451660156, + 14.791394233703613 + ], + "cluster": 10.0 + }, + { + "idx": 949, + "title": "Grokking of Hierarchical Structure in Vanilla Transformers", + "abstract": "For humans, language production and comprehension is sensitive to the hierarchical structure of sentences. In natural language processing, past work has questioned how effectively neural sequence models like transformers capture this hierarchical structure when generalizing to structurally novel inputs. We show that transformer language models can learn to generalize hierarchically after training for extremely long periods\u2014far beyond the point when in-domain accuracy has saturated. We call this phenomenon structural grokking. On multiple datasets, structural grokking exhibits inverted U-shaped scaling in model depth: intermediate-depth models generalize better than both very deep and very shallow transformers. When analyzing the relationship between model-internal properties and grokking, we find that optimal depth for grokking can be identified using the tree-structuredness metric of CITATION. Overall, our work provides strong evidence that, with extended training, vanilla transformers discover and use hierarchical structure.", + "authors": [ + "Shikhar Murty", + "Pratyusha Sharma", + "Jacob Andreas", + "Christopher Manning" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.38", + "point2d": [ + -25.430400848388672, + -45.99274444580078 + ], + "cluster": 6.0 + }, + { + "idx": 950, + "title": "Zero-shot Cross-lingual Transfer With Learned Projections Using Unlabeled Target-Language Data", + "abstract": "Adapters have emerged as a parameter-efficient Transformer-based framework for cross-lingual transfer by inserting lightweight language-specific modules (language adapters) and task-specific modules (task adapters) within pretrained multilingual models. Zero-shot transfer is enabled by pairing the language adapter in the target language with an appropriate task adapter in a source language. If our target languages are known apriori, we explore how zero-shot transfer can be further improved within the adapter framework by utilizing unlabeled text during task-specific finetuning. We construct language-specific subspaces using standard linear algebra constructs and selectively project source-language representations into the target language subspace during task-specific finetuning using two schemes. 
Our experiments on three cross-lingual tasks, Named Entity Recognition (NER), Question Answering (QA) and Natural Language Inference (NLI) yield consistent benefits compared to adapter baselines over a wide variety of target languages with up to 11% relative improvement in NER, 2% relative improvement in QA and 5% relative improvement in NLI.", + "authors": [ + "Ujan Deb", + "Ridayesh Parab", + "Preethi Jyothi" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.39", + "point2d": [ + -60.07090377807617, + -19.905933380126953 + ], + "cluster": 20.0 + }, + { + "idx": 951, + "title": "Context-Aware Transformer Pre-Training for Answer Sentence Selection", + "abstract": "Answer Sentence Selection (AS2) is a core component for building an accurate Question Answering pipeline. AS2 models rank a set of candidate sentences based on how likely they answer a given question. The state of the art in AS2 exploits pre-trained transformers by transferring them on large annotated datasets, while using local contextual information around the candidate sentence. In this paper, we propose three pre-training objectives designed to mimic the downstream fine-tuning task of contextual AS2. This allows for specializing LMs when fine-tuning for contextual AS2. Our experiments on three public and two large-scale industrial datasets show that our pre-training approaches (applied to RoBERTa and ELECTRA) can improve baseline contextual AS2 accuracy by up to 8% on some datasets.", + "authors": [ + "Luca Di Liello", + "Siddhant Garg", + "Alessandro Moschitti" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.40", + "point2d": [ + 65.63296508789062, + 14.502105712890625 + ], + "cluster": 5.0 + }, + { + "idx": 952, + "title": "Toward Expanding the Scope of Radiology Report Summarization to Multiple Anatomies and Modalities", + "abstract": "Radiology report summarization (RRS) is a growing area of research. Given the Findings section of a radiology report, the goal is to generate a summary (called an Impression section) that highlights the key observations and conclusions of the radiology study. However, RRS currently faces essential limitations. First, many prior studies conduct experiments on private datasets, preventing reproduction of results and fair comparisons across different systems and solutions. Second, most prior approaches are evaluated solely on chest X-rays. To address these limitations, we propose a dataset (MIMIC-RRS) involving three new modalities and seven new anatomies based on the MIMIC-III and MIMIC-CXR datasets. We then conduct extensive experiments to evaluate the performance of models both within and across modality-anatomy pairs in MIMIC-RRS. In addition, we evaluate their clinical efficacy via RadGraph, a factual correctness metric.", + "authors": [ + "Zhihong Chen", + "Maya Varma", + "Xiang Wan", + "Curtis Langlotz", + "Jean-Benoit Delbrouck" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.41", + "point2d": [ + 33.88098907470703, + -33.370574951171875 + ], + "cluster": 42.0 + }, + { + "idx": 953, + "title": "Efficient Diagnosis Assignment Using Unstructured Clinical Notes", + "abstract": "Electronic phenotyping entails using electronic health records (EHRs) to identify patients with specific health outcomes and determine when those outcomes occurred.
Unstructured clinical notes, which contain a vast amount of information, are a valuable resource for electronic phenotyping. However, traditional methods, such as rule-based labeling functions or neural networks, require significant manual effort to tune and may not generalize well to multiple indications. To address these challenges, we propose HyDE (hybrid diagnosis extractor). HyDE is a simple framework for electronic phenotyping that integrates labeling functions and a disease-agnostic neural network to assign diagnoses to patients. By training HyDE\u2019s model to correct predictions made by labeling functions, we are able to disambiguate hypertension true positives and false positives with a supervised area under the precision-recall curve (AUPRC) of 0.85. We extend this hypertension-trained model to zero-shot evaluation of four other diseases, generating AUPRC values ranging from 0.82 - 0.95 and outperforming a labeling function baseline by 44 points in F1 score and a Word2Vec baseline by 24 points in F1 score on average. Furthermore, we demonstrate a speedup of >4x by pruning the length of inputs into our language model to ~2.3% of the full clinical notes, with negligible impact on the AUPRC. HyDE has the potential to improve the efficiency and efficacy of interpreting large-scale unstructured clinical notes for accurate EHR phenotyping.", + "authors": [ + "Louis Blankemeier", + "Jason Fries", + "Robert Tinn", + "Joseph Preston", + "Nigam Shah", + "Akshay Chaudhari" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.42", + "point2d": [ + 31.775312423706055, + -41.364479064941406 + ], + "cluster": 42.0 + }, + { + "idx": 954, + "title": "MetaVL: Transferring In-Context Learning Ability From Language Models to Vision-Language Models", + "abstract": "Large-scale language models have shown the ability to adapt to a new task via conditioning on a few demonstrations (i.e., in-context learning). However, in the vision-language domain, most large-scale pre-trained vision-language (VL) models do not possess the ability to conduct in-context learning. How can we enable in-context learning for VL models? In this paper, we study an interesting hypothesis: can we transfer the in-context learning ability from the language domain to the VL domain? Specifically, we first meta-train a language model to perform in-context learning on NLP tasks (as in MetaICL); then we transfer this model to perform VL tasks by attaching a visual encoder. Our experiments suggest that indeed in-context learning ability can be transferred across modalities: our model considerably improves the in-context learning capability on VL tasks and can even compensate for the size of the model significantly. On VQA, OK-VQA, and GQA, our method could outperform the baseline model while having ~20 times fewer parameters.", + "authors": [ + "Masoud Monajatipoor", + "Liunian Harold Li", + "Mozhdeh Rouhsedaghat", + "Lin Yang", + "Kai-Wei Chang" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.43", + "point2d": [ + -52.272621154785156, + 33.93640899658203 + ], + "cluster": 26.0 + }, + { + "idx": 955, + "title": "On the Interpretability and Significance of Bias Metrics in Texts: a PMI-based Approach", + "abstract": "In recent years, word embeddings have been widely used to measure biases in texts.
Even though they have proven to be effective in detecting a wide variety of biases, metrics based on word embeddings lack transparency and interpretability. We analyze an alternative PMI-based metric to quantify biases in texts. It can be expressed as a function of conditional probabilities, which provides a simple interpretation in terms of word co-occurrences. We also prove that it can be approximated by an odds ratio, which allows estimating confidence intervals and statistical significance of textual biases. This approach produces similar results to metrics based on word embeddings when capturing gender gaps of the real world embedded in large corpora.", + "authors": [ + "Francisco Valentini", + "Germ\u00e1n Rosati", + "Dami\u00e1n Blasi", + "Diego Fernandez Slezak", + "Edgar Altszyler" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.44", + "point2d": [ + 20.772512435913086, + 32.009395599365234 + ], + "cluster": 10.0 + }, + { + "idx": 956, + "title": "Surface-Based Retrieval Reduces Perplexity of Retrieval-Augmented Language Models", + "abstract": "Augmenting language models with a retrieval mechanism has been shown to significantly improve their performance while keeping the number of parameters low. Retrieval-augmented models commonly rely on a semantic retrieval mechanism based on the similarity between dense representations of the query chunk and potential neighbors. In this paper, we study the state-of-the-art Retro model and observe that its performance gain is better explained by surface-level similarities, such as token overlap. Inspired by this, we replace the semantic retrieval in Retro with a surface-level method based on BM25, obtaining a significant reduction in perplexity. As full BM25 retrieval can be computationally costly for large datasets, we also apply it in a re-ranking scenario, gaining part of the perplexity reduction with minimal computational overhead.", + "authors": [ + "Ehsan Doostmohammadi", + "Tobias Norlund", + "Marco Kuhlmann", + "Richard Johansson" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.45", + "point2d": [ + 10.984601020812988, + -16.685705184936523 + ], + "cluster": 20.0 + }, + { + "idx": 957, + "title": "MIReAD: Simple Method for Learning High-quality Representations from Scientific Documents", + "abstract": "Learning semantically meaningful representations from scientific documents can facilitate academic literature search and improve performance of recommendation systems. Pretrained language models have been shown to learn rich textual representations, yet they cannot provide powerful document-level representations for scientific articles. We propose MIReAD, a simple method that learns high-quality representations of scientific papers by fine-tuning a transformer model to predict the target journal class based on the abstract. We train MIReAD on more than 500,000 PubMed and arXiv abstracts across over 2,000 journal classes. We show that MIReAD produces representations that can be used for retrieval of similar papers, topic categorization, and literature search. 
Our proposed approach outperforms six existing models for representation learning on scientific documents across four evaluation standards.", + "authors": [ + "Anastasiia Razdaibiedina", + "Aleksandr Brechalov" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.46", + "point2d": [ + 15.491491317749023, + 4.699963092803955 + ], + "cluster": 40.0 + }, + { + "idx": 958, + "title": "KNOW How to Make Up Your Mind! Adversarially Detecting and Alleviating Inconsistencies in Natural Language Explanations", + "abstract": "While recent works have been considerably improving the quality of the natural language explanations (NLEs) generated by a model to justify its predictions, there is very limited research in detecting and alleviating inconsistencies among generated NLEs. In this work, we leverage external knowledge bases to significantly improve on an existing adversarial attack for detecting inconsistent NLEs. We apply our attack to high-performing NLE models and show that models with higher NLE quality do not necessarily generate fewer inconsistencies. Moreover, we propose an off-the-shelf mitigation method to alleviate inconsistencies by grounding the model into external background knowledge. Our method decreases the inconsistencies of previous high-performing NLE models as detected by our attack.", + "authors": [ + "Myeongjun Jang", + "Bodhisattwa Prasad Majumder", + "Julian McAuley", + "Thomas Lukasiewicz", + "Oana-Maria Camburu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.47", + "point2d": [ + 26.98800277709961, + -4.0786237716674805 + ], + "cluster": 31.0 + }, + { + "idx": 959, + "title": "Measuring the Effect of Influential Messages on Varying Personas", + "abstract": "Predicting how a user responds to news events enables important applications such as allowing intelligent agents or content producers to estimate the effect on different communities and revise unreleased messages to prevent unexpected bad outcomes such as social conflict and moral injury. We present a new task, Response Forecasting on Personas for News Media, to estimate the response a persona (characterizing an individual or a group) might have upon seeing a news message. Compared to the previous efforts which only predict generic comments to news, the proposed task not only introduces personalization in the modeling but also predicts the sentiment polarity and intensity of each response. This enables more accurate and comprehensive inference on the mental state of the persona. Meanwhile, the generated sentiment dimensions make the evaluation and application more reliable. We create the first benchmark dataset, which consists of 13,357 responses to 3,847 news headlines from Twitter. We further evaluate the SOTA neural language models with our dataset. The empirical results suggest that the included persona attributes are helpful for the performance of all response dimensions. 
Our analysis shows that the best-performing models are capable of predicting responses that are consistent with the personas, and as a byproduct, the task formulation also enables many interesting applications in the analysis of social network groups and their opinions, such as the discovery of extreme opinion groups.", + "authors": [ + "Chenkai Sun", + "Jinning Li", + "Hou Pong Chan", + "ChengXiang Zhai", + "Heng Ji" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.48", + "point2d": [ + 18.98798942565918, + 36.39822006225586 + ], + "cluster": 19.0 + }, + { + "idx": 960, + "title": "Going Beyond Sentence Embeddings: A Token-Level Matching Algorithm for Calculating Semantic Textual Similarity", + "abstract": "Semantic Textual Similarity (STS) measures the degree to which the underlying semantics of paired sentences are equivalent. State-of-the-art methods for the STS task use language models to encode sentences into embeddings. However, these embeddings are limited in representing semantics because they mix all the semantic information together in fixed-length vectors, which are difficult to recover and lack explainability. This paper presents a token-level matching inference algorithm, which can be applied on top of any language model to improve its performance on the STS task. Our method calculates pairwise token-level similarity and token matching scores, and then aggregates them with pretrained token weights to produce sentence similarity. Experimental results on seven STS datasets show that our method improves the performance of almost all language models, with up to 12.7% gain in Spearman\u2019s correlation. We also demonstrate that our method is highly explainable and computationally efficient.", + "authors": [ + "Hongwei Wang", + "Dong Yu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.49", + "point2d": [ + 1.5337532758712769, + -36.36800003051758 + ], + "cluster": 9.0 + }, + { + "idx": 961, + "title": "Robust Learning for Multi-party Addressee Recognition with Discrete Addressee Codebook", + "abstract": "Addressee recognition aims to identify addressees in multi-party conversations. While state-of-the-art addressee recognition models have achieved promising performance, they still suffer from the issue of robustness when applied in real-world settings. When exposed to a noisy environment, these models treat the noise as input and identify an addressee from a pre-given closed set, even though the true addressee of the noisy input does not belong to this set, thus leading to incorrect addressee identification. To this end, we propose a Robust Addressee Recognition (RAR) method, which discretizes the addressees into a character codebook, making it able to represent open-set addressees and remain robust in a noisy environment. 
Experimental results show that the introduction of the addressee character codebook helps represent open-set addressees and greatly improves the robustness of addressee recognition even when the input is noisy.", + "authors": [ + "Pengcheng Zhu", + "Wei Zhou", + "Kuncai Zhang", + "Yuankai Ma", + "Haiqing Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.50", + "point2d": [ + 3.663747787475586, + 62.096771240234375 + ], + "cluster": 32.0 + }, + { + "idx": 962, + "title": "TwistList: Resources and Baselines for Tongue Twister Generation", + "abstract": "Previous work in phonetically-grounded language generation has mainly focused on domains such as lyrics and poetry. In this paper, we present work on the generation of tongue twisters - a form of language that is required to be phonetically conditioned to maximise sound overlap, whilst maintaining semantic consistency with an input topic, and still being grammatically correct. We present TwistList, a large annotated dataset of tongue twisters, consisting of 2.1K+ human-authored examples. We additionally present several benchmark systems (referred to as TwisterMisters) for the proposed task of tongue twister generation, including models that both do and do not require training on in-domain data. We present the results of automatic and human evaluation to demonstrate the performance of existing mainstream pre-trained models in this task with limited (or no) task-specific training and data, and no explicit phonetic knowledge. We find that the task of tongue twister generation is challenging for models under these conditions, yet some models are still capable of generating acceptable examples of this language type.", + "authors": [ + "Tyler Loakman", + "Chen Tang", + "Chenghua Lin" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.51", + "point2d": [ + -31.322690963745117, + 40.39068603515625 + ], + "cluster": 35.0 + }, + { + "idx": 963, + "title": "Substitution-based Semantic Change Detection using Contextual Embeddings", + "abstract": "Measuring semantic change has thus far remained a task where methods using contextual embeddings have struggled to improve upon simpler techniques relying only on static word vectors. Moreover, many of the previously proposed approaches suffer from downsides related to scalability and ease of interpretation. We present a simplified approach to measuring semantic change using contextual embeddings, relying only on the most probable substitutes for masked terms. 
Not only is this approach directly interpretable, it is also far more efficient in terms of storage, achieves superior average performance across the most frequently cited datasets for this task, and allows for more nuanced investigation of change than is possible with static word vectors.", + "authors": [ + "Dallas Card" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.52", + "point2d": [ + 3.747049331665039, + -44.53304672241211 + ], + "cluster": 9.0 + }, + { + "idx": 964, + "title": "Probing Physical Reasoning with Counter-Commonsense Context", + "abstract": "In this study, we create a CConS (Counter-commonsense Contextual Size comparison) dataset to investigate how physical commonsense affects the contextualized size comparison task; the proposed dataset consists of both contexts that fit physical commonsense and those that do not. This dataset tests the ability of language models to predict the size relationship between objects under various contexts generated from our curated noun list and templates. We measure the ability of several masked language models and encoder-decoder models. The results show that while large language models can use prepositions such as \u201cin\u201d and \u201cinto\u201d in the provided context to infer size relationships, they fail to use verbs and thus make incorrect judgments driven by their prior physical commonsense.", + "authors": [ + "Kazushi Kondo", + "Saku Sugawara", + "Akiko Aizawa" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.53", + "point2d": [ + 38.241004943847656, + -6.725469589233398 + ], + "cluster": 36.0 + }, + { + "idx": 965, + "title": "Morphological Inflection with Phonological Features", + "abstract": "Recent years have brought great advances in solving morphological tasks, mostly due to powerful neural models applied to various tasks such as (re)inflection and analysis. Yet, such morphological tasks cannot be considered solved, especially when little training data is available or when generalizing to previously unseen lemmas. This work explores effects on performance obtained through various ways in which morphological models get access to sub-character phonological features that are often the targets of morphological processes. We design two methods to achieve this goal: one that leaves models as is but manipulates the data to include features instead of characters, and another that manipulates models to take phonological features into account when building representations for phonemes. We elicit phonemic data from standard graphemic data using language-specific grammars for languages with shallow grapheme-to-phoneme mapping, and we experiment with two reinflection models over eight languages. Our results show that our methods yield comparable results to the grapheme-based baseline overall, with minor improvements in some of the languages. 
All in all, we conclude that patterns in character distributions are likely to allow models to infer the underlying phonological characteristics, even when phonemes are not explicitly represented.", + "authors": [ + "David Guriel", + "Omer Goldman", + "Reut Tsarfaty" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.54", + "point2d": [ + -36.398406982421875, + -42.93223571777344 + ], + "cluster": 46.0 + }, + { + "idx": 966, + "title": "A Holistic Approach to Reference-Free Evaluation of Machine Translation", + "abstract": "Traditional machine translation evaluation relies on references written by humans, while reference-free evaluation removes the constraint of labor-intensive annotation, pivots easily to new domains, and is more scalable. In this paper, we propose a reference-free evaluation approach that characterizes evaluation along two aspects: (1) fluency: how well the translated text conforms to normal human language usage; (2) faithfulness: how well the translated text reflects the source data. We further split the faithfulness into word-level and sentence-level. Extensive experiments spanning WMT18/19/21 Metrics segment-level daRR and MQM datasets demonstrate that our proposed reference-free approach, ReFreeEval, outperforms SOTA reference-free metrics like YiSi-2.", + "authors": [ + "Hanming Wu", + "Wenjuan Han", + "Hui Di", + "Yufeng Chen", + "Jinan Xu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.55", + "point2d": [ + -73.13639068603516, + -2.761261463165283 + ], + "cluster": 1.0 + }, + { + "idx": 967, + "title": "Balancing Lexical and Semantic Quality in Abstractive Summarization", + "abstract": "An important problem of the sequence-to-sequence neural models widely used in abstractive summarization is exposure bias. To alleviate this problem, re-ranking systems have been applied in recent years. Despite some performance improvements, this approach remains underexplored. Previous works have mostly specified the rank through the ROUGE score and aligned candidate summaries, but there can be quite a large gap between the lexical overlap metric and semantic similarity. In this paper, we propose a novel training method in which a re-ranker balances the lexical and semantic quality. We further newly define false positives in ranking and present a strategy to reduce their influence. Experiments on the CNN/DailyMail and XSum datasets show that our method can estimate the meaning of summaries without seriously degrading the lexical aspect. More specifically, it achieves an 89.67 BERTScore on the CNN/DailyMail dataset, reaching new state-of-the-art performance. Our code is publicly available at https://github.com/jeewoo1025/BalSum.", + "authors": [ + "Jeewoo Sul", + "Yong Suk Choi" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.56", + "point2d": [ + -6.039032936096191, + 42.46247863769531 + ], + "cluster": 7.0 + }, + { + "idx": 968, + "title": "Learning Neuro-Symbolic World Models with Conversational Proprioception", + "abstract": "The recent emergence of Neuro-Symbolic Agent (NeSA) approaches to natural language-based interactions calls for the investigation of model-based approaches. In contrast to model-free approaches, which existing NeSAs take, learning an explicit world model has interesting potential, especially for explainability, which is one of the key selling points of NeSA. 
To learn useful world models, we leverage one of the recent neuro-symbolic architectures, Logical Neural Networks (LNN). Here, we describe a method that can learn neuro-symbolic world models on the TextWorld-Commonsense set of games. We then show how this can be improved further by taking inspiration from the concept of proprioception, but for conversation. This is done by enhancing the internal logic state with a memory of previous actions while also guiding future actions by augmenting the learned model with constraints based on this memory. This greatly improves the game-solving agent\u2019s performance in a TextWorld setting, where the advantage over the baseline is an 85% average reduction in steps and a 2.3x higher average score.", + "authors": [ + "Don Joven Agravante", + "Daiki Kimura", + "Michiaki Tatsubori", + "Asim Munawar", + "Alexander Gray" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.57", + "point2d": [ + 52.13882827758789, + -5.4289445877075195 + ], + "cluster": 36.0 + }, + { + "idx": 969, + "title": "In and Out-of-Domain Text Adversarial Robustness via Label Smoothing", + "abstract": "Recently it has been shown that state-of-the-art NLP models are vulnerable to adversarial attacks, where the predictions of a model can be drastically altered by slight modifications to the input (such as synonym substitutions). While several defense techniques have been proposed and adapted to the discrete nature of text adversarial attacks, the benefits of general-purpose regularization methods such as label smoothing for language models have not been studied. In this paper, we study the adversarial robustness provided by label smoothing strategies in foundational models for diverse NLP tasks in both in-domain and out-of-domain settings. Our experiments show that label smoothing significantly improves adversarial robustness in pre-trained models like BERT, against various popular attacks. We also analyze the relationship between prediction confidence and robustness, showing that label smoothing reduces over-confident errors on adversarial examples.", + "authors": [ + "Yahan Yang", + "Soham Dan", + "Dan Roth", + "Insup Lee" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.58", + "point2d": [ + 2.783439874649048, + 7.340229511260986 + ], + "cluster": 48.0 + }, + { + "idx": 970, + "title": "LM-CPPF: Paraphrasing-Guided Data Augmentation for Contrastive Prompt-Based Few-Shot Fine-Tuning", + "abstract": "In recent years, there has been significant progress in developing pre-trained language models for NLP. However, these models often struggle when fine-tuned on small datasets. To address this issue, researchers have proposed various adaptation approaches. Prompt-based tuning is arguably the most common way, especially for larger models. Previous research shows that adding contrastive learning to prompt-based fine-tuning is effective as it helps the model generate embeddings that are more distinguishable between classes, and it can also be more sample-efficient as the model learns from positive and negative examples simultaneously. One of the most important components of contrastive learning is data augmentation, but unlike computer vision, effective data augmentation for NLP is still challenging. 
This paper proposes LM-CPPF, Contrastive Paraphrasing-guided Prompt-based Fine-tuning of Language Models, which leverages prompt-based few-shot paraphrasing using generative language models, especially large language models such as GPT-3 and OPT-175B, for data augmentation. Our experiments on multiple text classification benchmarks show that this augmentation method outperforms other methods, such as easy data augmentation, back translation, and multiple templates.", + "authors": [ + "Amirhossein Abaskohi", + "Sascha Rothe", + "Yadollah Yaghoobzadeh" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.59", + "point2d": [ + -19.639299392700195, + -5.609066486358643 + ], + "cluster": 20.0 + }, + { + "idx": 971, + "title": "Considerations for meaningful sign language machine translation based on glosses", + "abstract": "Automatic sign language processing is gaining popularity in Natural Language Processing (NLP) research (Yin et al., 2021). In machine translation (MT) in particular, sign language translation based on glosses is a prominent approach. In this paper, we review recent works on neural gloss translation. We find that limitations of glosses in general and limitations of specific datasets are not discussed in a transparent manner and that there is no common standard for evaluation. To address these issues, we put forward concrete recommendations for future research on gloss translation. Our suggestions advocate awareness of the inherent limitations of gloss-based approaches, realistic datasets, stronger baselines, and convincing evaluation.", + "authors": [ + "Mathias M\u00fcller", + "Zifan Jiang", + "Amit Moryossef", + "Annette Rios", + "Sarah Ebling" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.60", + "point2d": [ + -81.15345764160156, + 21.404138565063477 + ], + "cluster": 37.0 + }, + { + "idx": 972, + "title": "Detecting Contradictory COVID-19 Drug Efficacy Claims from Biomedical Literature", + "abstract": "The COVID-19 pandemic created a deluge of questionable and contradictory scientific claims about drug efficacy \u2013 an \u201cinfodemic\u201d with lasting consequences for science and society. In this work, we argue that NLP models can help domain experts distill and understand the literature in this complex, high-stakes area. Our task is to automatically identify contradictory claims about COVID-19 drug efficacy. We frame this as a natural language inference problem and offer a new NLI dataset created by domain experts. The NLI framing allows us to create curricula combining existing datasets and our own. The resulting models are useful investigative tools. We provide a case study of how these models help a domain expert summarize and assess evidence concerning remdesivir and hydroxychloroquine.", + "authors": [ + "Daniel Sosa", + "Malavika Suresh", + "Christopher Potts", + "Russ Altman" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.61", + "point2d": [ + 30.73814582824707, + 12.223459243774414 + ], + "cluster": 40.0 + }, + { + "idx": 973, + "title": "The Role of Global and Local Context in Named Entity Recognition", + "abstract": "Pre-trained transformer-based models have recently shown great performance when applied to Named Entity Recognition (NER). 
As the complexity of their self-attention mechanism prevents them from processing long documents at once, these models are usually applied in a sequential fashion. Such an approach unfortunately only incorporates local context and prevents leveraging global document context in long documents such as novels, which might hinder performance. In this article, we explore the impact of global document context and its relationship with local context. We find that correctly retrieving global document context has a greater impact on performance than only leveraging local context, prompting further research on how to better retrieve that context.", + "authors": [ + "Arthur Amalvy", + "Vincent Labatut", + "Richard Dufour" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.62", + "point2d": [ + 30.703163146972656, + -84.74752044677734 + ], + "cluster": 14.0 + }, + { + "idx": 974, + "title": "Joint End-to-end Semantic Proto-role Labeling", + "abstract": "Semantic proto-role labeling (SPRL) assigns properties to arguments based on a series of binary labels. While multiple studies have evaluated various approaches to SPRL, it has only been studied in-depth as a standalone task using gold predicate/argument pairs. How do SPRL systems perform as part of an information extraction pipeline? We model SPRL jointly with predicate-argument extraction using a deep transformer model. We find that proto-role labeling is surprisingly robust in this setting, with only a small decrease when using predicted arguments. We include a detailed analysis of each component of the joint system, and an error analysis to understand correlations in errors between system stages. Finally, we study the effects of annotation errors on SPRL.", + "authors": [ + "Elizabeth Spaulding", + "Gary Kazantsev", + "Mark Dredze" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.63", + "point2d": [ + 42.353519439697266, + -47.20297622680664 + ], + "cluster": 28.0 + }, + { + "idx": 975, + "title": "Improving Automatic Quotation Attribution in Literary Novels", + "abstract": "Current models for quotation attribution in literary novels assume varying levels of available information in their training and test data, which poses a challenge for in-the-wild inference. Here, we approach quotation attribution as a set of four interconnected sub-tasks: character identification, coreference resolution, quotation identification, and speaker attribution. We benchmark state-of-the-art models on each of these sub-tasks independently, using a large dataset of annotated coreferences and quotations in literary novels (the Project Dialogism Novel Corpus). We also train and evaluate models for the speaker attribution task in particular, showing that a simple sequential prediction model achieves accuracy scores on par with state-of-the-art models.", + "authors": [ + "Krishnapriya Vishnubhotla", + "Frank Rudzicz", + "Graeme Hirst", + "Adam Hammond" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.64", + "point2d": [ + -4.612410068511963, + 25.346399307250977 + ], + "cluster": 19.0 + }, + { + "idx": 976, + "title": "Modular Visual Question Answering via Code Generation", + "abstract": "We present a framework that formulates visual question answering as modular code generation. 
In contrast to prior work on modular approaches to VQA, our approach requires no additional training and relies on pre-trained language models (LMs), visual models pre-trained on image-caption pairs, and fifty VQA examples used for in-context learning. The generated Python programs invoke and compose the outputs of the visual models using arithmetic and conditional logic. Our approach improves accuracy on the COVR dataset by at least 3% and on the GQA dataset by 2% compared to the few-shot baseline that does not employ code generation.", + "authors": [ + "Sanjay Subramanian", + "Medhini Narasimhan", + "Kushal Khangaonkar", + "Kevin Yang", + "Arsha Nagrani", + "Cordelia Schmid", + "Andy Zeng", + "Trevor Darrell", + "Dan Klein" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.65", + "point2d": [ + 56.45309829711914, + 4.362204074859619 + ], + "cluster": 43.0 + }, + { + "idx": 977, + "title": "Target-Based Offensive Language Identification", + "abstract": "We present TBO, a new dataset for Target-based Offensive language identification. TBO contains post-level annotations regarding the harmfulness of an offensive post and token-level annotations comprising the target and the offensive argument expression. Popular offensive language identification datasets for social media focus on annotation taxonomies only at the post level; more recently, some datasets have been released that feature only token-level annotations. TBO is an important resource that bridges the gap between post-level and token-level annotation datasets by introducing a single comprehensive unified annotation taxonomy. We use the TBO taxonomy to annotate post-level and token-level offensive language on English Twitter posts. We release an initial dataset of over 4,500 instances collected from Twitter and we carry out multiple experiments to compare the performance of different models trained and tested on TBO.", + "authors": [ + "Marcos Zampieri", + "Skye Morgan", + "Kai North", + "Tharindu Ranasinghe", + "Austin Simmmons", + "Paridhi Khandelwal", + "Sara Rosenthal", + "Preslav Nakov" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.66", + "point2d": [ + 29.879140853881836, + 22.110015869140625 + ], + "cluster": 15.0 + }, + { + "idx": 978, + "title": "Unsupervised Subtitle Segmentation with Masked Language Models", + "abstract": "We describe a novel unsupervised approach to subtitle segmentation, based on pretrained masked language models, where line endings and subtitle breaks are predicted according to the likelihood of punctuation occurring at candidate segmentation points. Our approach obtained competitive results in terms of segmentation accuracy across metrics, while also fully preserving the original text and complying with length constraints. 
Although supervised models trained on in-domain data and with access to source audio information can provide better segmentation accuracy, our approach is highly portable across languages and domains and may constitute a robust off-the-shelf solution for subtitle segmentation.", + "authors": [ + "David Ponce", + "Thierry Etchegoyhen", + "Victor Ruiz" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.67", + "point2d": [ + -48.8609619140625, + 9.83353042602539 + ], + "cluster": 30.0 + }, + { + "idx": 979, + "title": "Exploring Continual Learning for Code Generation Models", + "abstract": "Large-scale code generation models such as Copilot and CodeT5 have achieved impressive performance. However, libraries are upgraded or deprecated very frequently and re-training large-scale language models is computationally expensive. Therefore, Continual Learning (CL) is an important aspect that remains under-explored in the code domain. In this paper, we introduce a benchmark called CodeTask-CL that covers a wide range of tasks, including code generation, translation, summarization, and refinement, with different input and output programming languages. Next, on our CodeTask-CL benchmark, we compare popular CL techniques from NLP and Vision domains. We find that effective methods like Prompt Pooling (PP) suffer from catastrophic forgetting due to the unstable training of the prompt selection mechanism caused by stark distribution shifts in coding tasks. We address this issue with our proposed method, Prompt Pooling with Teacher Forcing (PP-TF), that stabilizes training by enforcing constraints on the prompt selection mechanism and leads to a 21.54% improvement over Prompt Pooling. Along with the benchmark, we establish a training pipeline that can be used for CL on code models, which we believe can motivate further development of CL methods for code models.", + "authors": [ + "Prateek Yadav", + "Qing Sun", + "Hantian Ding", + "Xiaopeng Li", + "Dejiao Zhang", + "Ming Tan", + "Parminder Bhatia", + "Xiaofei Ma", + "Ramesh Nallapati", + "Murali Krishna Ramanathan", + "Mohit Bansal", + "Bing Xiang" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.68", + "point2d": [ + -10.205127716064453, + -50.07067108154297 + ], + "cluster": 11.0 + }, + { + "idx": 980, + "title": "Deep Active Learning for Morphophonological Processing", + "abstract": "Building a system for morphological processing is a challenging task in morphologically complex languages like Arabic. Although there are some deep learning-based models that achieve successful results, these models rely on a large amount of annotated data. Building such datasets, especially for some of the lower-resource Arabic dialects, is very difficult, time-consuming, and expensive. In addition, some parts of the annotated data do not contain useful information for training machine learning models. Active learning strategies allow the learner algorithm to select the most informative samples for annotation. There has been little research that focuses on applying active learning for morphological inflection and morphophonological processing. In this paper, we propose a deep active learning method for this task. 
Our experiments on Egyptian Arabic show that with only about 30% of the annotated data, we achieve the same results as does the state-of-the-art model on the whole dataset.", + "authors": [ + "Seyed Morteza Mirbostani", + "Yasaman Boreshban", + "Salam Khalifa", + "SeyedAbolghasem Mirroshandel", + "Owen Rambow" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.69", + "point2d": [ + -34.76725387573242, + -45.45070266723633 + ], + "cluster": 46.0 + }, + { + "idx": 981, + "title": "Counterfactual reasoning: Testing language models\u2019 understanding of hypothetical scenarios", + "abstract": "Current pre-trained language models have enabled remarkable improvements in downstream tasks, but it remains difficult to distinguish effects of statistical correlation from more systematic logical reasoning grounded in an understanding of the real world. We tease these factors apart by leveraging counterfactual conditionals, which force language models to predict unusual consequences based on hypothetical propositions. We introduce a set of tests from psycholinguistic experiments, as well as larger-scale controlled datasets, to probe counterfactual predictions from five pre-trained language models. We find that models are consistently able to override real-world knowledge in counterfactual scenarios, and that this effect is more robust in the case of stronger baseline world knowledge\u2014however, we also find that for most models this effect appears largely to be driven by simple lexical cues. When we mitigate effects of both world knowledge and lexical cues to test knowledge of linguistic nuances of counterfactuals, we find that only GPT-3 shows sensitivity to these nuances, though this sensitivity is also non-trivially impacted by lexical associative factors.", + "authors": [ + "Jiaxuan Li", + "Lang Yu", + "Allyson Ettinger" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.70", + "point2d": [ + 32.33134841918945, + -2.9794118404388428 + ], + "cluster": 31.0 + }, + { + "idx": 982, + "title": "Bhasa-Abhijnaanam: Native-script and romanized Language Identification for 22 Indic languages", + "abstract": "We create publicly available language identification (LID) datasets and models in all 22 Indian languages listed in the Indian constitution in both native-script and romanized text. First, we create Bhasha-Abhijnaanam, a language identification test set for native-script as well as romanized text which spans all 22 Indic languages. We also train IndicLID, a language identifier for all the above-mentioned languages in both native and romanized script. For native-script text, it has better language coverage than existing LIDs and is competitive or better than other LIDs. IndicLID is the first LID for romanized text in Indian languages. Two major challenges for romanized text LID are the lack of training data and low-LID performance when languages are similar. We provide simple and effective solutions to these problems. In general, there has been limited work on romanized text in any language, and our findings are relevant to other languages that need romanized language identification. Our models are publicly available at https://github.com/AI4Bharat/IndicLID under open-source licenses. Our training and test sets are also publicly available at https://huggingface.co/datasets/ai4bharat/Bhasha-Abhijnaanam under open-source licenses.", + "authors": [ + "Yash Madhani", + "Mitesh M. 
Khapra", + "Anoop Kunchukuttan" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.71", + "point2d": [ + -52.05398941040039, + -5.526007652282715 + ], + "cluster": 46.0 + }, + { + "idx": 983, + "title": "Using contradictions improves question answering systems", + "abstract": "This work examines the use of contradiction in natural language inference (NLI) for question answering (QA). Typically, NLI systems help answer questions by determining if a potential answer is entailed (supported) by some background context. But is it useful to also determine if an answer contradicts the context? We test this in two settings, multiple choice and extractive QA, and find that systems that incorporate contradiction can do slightly better than entailment-only systems on certain datasets. However, the best performances come from using contradiction, entailment, and QA model confidence scores together. This has implications for the deployment of QA systems in domains such as medicine and science where safety is an issue.", + "authors": [ + "Etienne Fortier-Dubois", + "Domenic Rosati" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.72", + "point2d": [ + 62.11837387084961, + -1.8540077209472656 + ], + "cluster": 31.0 + }, + { + "idx": 984, + "title": "Token-Level Self-Evolution Training for Sequence-to-Sequence Learning", + "abstract": "Adaptive training approaches, widely used in sequence-to-sequence models, commonly reweigh the losses of different target tokens based on priors, e.g. word frequency. However, most of them do not consider the variation of learning difficulty in different training steps, and overly emphasize the learning of difficult one-hot labels, making the learning deterministic and sub-optimal. In response, we present Token-Level Self-Evolution Training (SE), a simple and effective dynamic training method to fully and wisely exploit the knowledge from data. SE focuses on dynamically learning the under-explored tokens for each forward pass and adaptively regularizes the training by introducing a novel token-specific label smoothing approach. Empirically, SE yields consistent and significant improvements in three tasks, i.e. machine translation, summarization, and grammatical error correction. Encouragingly, we achieve averaging +0.93 BLEU improvement on three machine translation tasks. Analyses confirm that, besides improving lexical accuracy, SE enhances generation diversity and model generalization.", + "authors": [ + "Keqin Peng", + "Liang Ding", + "Qihuang Zhong", + "Yuanxin Ouyang", + "Wenge Rong", + "Zhang Xiong", + "Dacheng Tao" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.73", + "point2d": [ + -25.843164443969727, + 2.1572158336639404 + ], + "cluster": 4.0 + }, + { + "idx": 985, + "title": "Gradient Ascent Post-training Enhances Language Model Generalization", + "abstract": "In this work, we empirically show that updating pretrained LMs (350M, 1.3B, 2.7B) with just a few steps of Gradient Ascent Post-training (GAP) on random, unlabeled text corpora enhances its zero-shot generalization capabilities across diverse NLP tasks. Specifically, we show that GAP can allow LMs to become comparable to 2-3x times larger LMs across 12 different NLP tasks. We also show that applying GAP on out-of-distribution corpora leads to the most reliable performance improvements. 
Our findings indicate that GAP can be a promising method for improving the generalization capability of LMs without any task-specific fine-tuning.", + "authors": [ + "Dongkeun Yoon", + "Joel Jang", + "Sungdong Kim", + "Minjoon Seo" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.74", + "point2d": [ + -26.609607696533203, + -14.126792907714844 + ], + "cluster": 8.0 + }, + { + "idx": 986, + "title": "An Open Dataset and Model for Language Identification", + "abstract": "Language identification (LID) is a fundamental step in many natural language processing pipelines. However, current LID systems are far from perfect, particularly on lower-resource languages. We present a LID model which achieves a macro-average F1 score of 0.93 and a false positive rate of 0.033% across 201 languages, outperforming previous work. We achieve this by training on a curated dataset of monolingual data, which we audit manually to ensure reliability. We make both the model and the dataset available to the research community. Finally, we carry out detailed analysis into our model\u2019s performance, both in comparison to existing open models and by language class.", + "authors": [ + "Laurie Burchell", + "Alexandra Birch", + "Nikolay Bogoychev", + "Kenneth Heafield" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.75", + "point2d": [ + -51.628578186035156, + -6.805135250091553 + ], + "cluster": 46.0 + }, + { + "idx": 987, + "title": "Evaluating Paraphrastic Robustness in Textual Entailment Models", + "abstract": "We present PaRTE, a collection of 1,126 pairs of Recognizing Textual Entailment (RTE) examples to evaluate whether models are robust to paraphrasing. We posit that if RTE models understand language, their predictions should be consistent across inputs that share the same meaning. We use the evaluation set to determine if RTE models\u2019 predictions change when examples are paraphrased. In our experiments, contemporary models change their predictions on 8-16% of paraphrased examples, indicating that there is still room for improvement.", + "authors": [ + "Dhruv Verma", + "Yash Kumar Lal", + "Shreyashee Sinha", + "Benjamin Van Durme", + "Adam Poliak" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.76", + "point2d": [ + -10.175137519836426, + 4.992674827575684 + ], + "cluster": 36.0 + }, + { + "idx": 988, + "title": "Are Pre-trained Language Models Useful for Model Ensemble in Chinese Grammatical Error Correction?", + "abstract": "Model ensemble has been in widespread use for Grammatical Error Correction (GEC), boosting model performance. We hypothesize that model ensemble based on the perplexity (PPL) computed by pre-trained language models (PLMs) should benefit the GEC system. To this end, we explore several ensemble strategies based on strong PLMs with four sophisticated single models. However, the performance does not improve but even gets worse after the PLM-based ensemble. This surprising result led us to conduct a detailed analysis of the data and come up with some insights on GEC. The human references of correct sentences are far from sufficient in the test data, and the gap between a correct sentence and an idiomatic one is worth our attention. Moreover, the PLM-based ensemble strategies provide an effective way to extend and improve GEC benchmark data. 
Our source code is available at https://github.com/JamyDon/PLM-based-CGEC-Model-Ensemble.", + "authors": [ + "Chenming Tang", + "Xiuyu Wu", + "Yunfang Wu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.77", + "point2d": [ + -38.214298248291016, + 12.906143188476562 + ], + "cluster": 46.0 + }, + { + "idx": 989, + "title": "Improving Factuality of Abstractive Summarization without Sacrificing Summary Quality", + "abstract": "Improving factual consistency of abstractive summarization has been a widely studied topic. However, most of the prior works on training factuality-aware models have ignored the negative effect such training has on summary quality. We propose EFactSum (i.e., Effective Factual Summarization), a candidate summary generation and ranking technique to improve summary factuality without sacrificing quality. We show that using a contrastive learning framework with our refined candidate summaries leads to significant gains on both factuality and similarity-based metrics. Specifically, we propose a ranking strategy in which we effectively combine two metrics, thereby preventing any conflict during training. Models trained using our approach show up to 6 points of absolute improvement over the base model with respect to FactCC on XSUM and 11 points on CNN/DM, without negatively affecting either similarity-based metrics or abstractiveness.", + "authors": [ + "Tanay Dixit", + "Fei Wang", + "Muhao Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.78", + "point2d": [ + -5.8823933601379395, + 44.221160888671875 + ], + "cluster": 47.0 + }, + { + "idx": 990, + "title": "With a Little Push, NLI Models can Robustly and Efficiently Predict Faithfulness", + "abstract": "Conditional language models still generate unfaithful output that is not supported by their input. These unfaithful generations jeopardize trust in real-world applications such as summarization or human-machine interaction, motivating a need for automatic faithfulness metrics. To implement such metrics, NLI models seem attractive, since they solve a strongly related task that comes with a wealth of prior research and data. But recent research suggests that NLI models require costly additional machinery to perform reliably across datasets, e.g., by running inference on a Cartesian product of input and generated sentences, or supporting them with a question-generation/answering step. In this work we show that pure NLI models _can_ outperform more complex metrics when combining task-adaptive data augmentation with robust inference procedures. 
We propose: (1) Augmenting NLI training data to adapt NL inferences to the specificities of faithfulness prediction in dialogue; (2) Making use of both entailment and contradiction probabilities in NLI, and (3) Using Monte-Carlo dropout during inference. Applied to the TRUE benchmark, which combines faithfulness datasets across diverse domains and tasks, our approach strongly improves a vanilla NLI model and significantly outperforms previous work, while showing favourable computational cost.", + "authors": [ + "Julius Steen", + "Juri Opitz", + "Anette Frank", + "Katja Markert" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.79", + "point2d": [ + 40.19601058959961, + -3.9742720127105713 + ], + "cluster": 36.0 + }, + { + "idx": 991, + "title": "A Better Way to Do Masked Language Model Scoring", + "abstract": "Estimating the log-likelihood of a given sentence under an autoregressive language model is straightforward: one can simply apply the chain rule and sum the log-likelihood values for each successive token. However, for masked language models (MLMs), there is no direct way to estimate the log-likelihood of a sentence. To address this issue, Salazar et al. (2020) propose to estimate sentence pseudo-log-likelihood (PLL) scores, computed by successively masking each sentence token, retrieving its score using the rest of the sentence as context, and summing the resulting values. Here, we demonstrate that the original PLL method yields inflated scores for out-of-vocabulary words and propose an adapted metric, in which we mask not only the target token, but also all within-word tokens to the right of the target. We show that our adapted metric (PLL-word-l2r) outperforms both the original PLL metric and a PLL metric in which all within-word tokens are masked. In particular, it better satisfies theoretical desiderata and better correlates with scores from autoregressive models. Finally, we show that the choice of metric affects even tightly controlled, minimal pair evaluation benchmarks (such as BLiMP), underscoring the importance of selecting an appropriate scoring metric for evaluating MLM properties.", + "authors": [ + "Carina Kauf", + "Anna Ivanova" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.80", + "point2d": [ + -38.21598434448242, + -1.2850630283355713 + ], + "cluster": 30.0 + }, + { + "idx": 992, + "title": "ChatGPT for Zero-shot Dialogue State Tracking: A Solution or an Opportunity?", + "abstract": "Recent research on dialog state tracking (DST) focuses on methods that allow few- and zero-shot transfer to new domains or schemas. However, performance gains heavily depend on aggressive data augmentation and fine-tuning of ever larger language model based architectures. In contrast, general purpose language models, trained on large amounts of diverse data, hold the promise of solving any kind of task without task-specific training. We present preliminary experimental results on the ChatGPT research preview, showing that ChatGPT achieves state-of-the-art performance in zero-shot DST. Despite our findings, we argue that properties inherent to general purpose models limit their ability to replace specialized systems. 
We further theorize that the in-context learning capabilities of such models will likely become powerful tools to support the development of dedicated dialog state trackers and enable dynamic methods.", + "authors": [ + "Michael Heck", + "Nurul Lubis", + "Benjamin Ruppik", + "Renato Vukovic", + "Shutong Feng", + "Christian Geishauser", + "Hsien-chin Lin", + "Carel van Niekerk", + "Milica Gasic" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.81", + "point2d": [ + 2.3154613971710205, + 71.06754302978516 + ], + "cluster": 49.0 + }, + { + "idx": 993, + "title": "Controllable Mixed-Initiative Dialogue Generation through Prompting", + "abstract": "Mixed-initiative dialogue tasks involve repeated exchanges of information and conversational control. Conversational agents gain control by generating responses that follow particular dialogue intents or strategies, prescribed by a policy planner. The standard approach has been fine-tuning pre-trained language models to perform generation conditioned on these intents. However, these supervised generation models are limited by the cost and quality of data annotation. We instead prompt large language models as a drop-in replacement for fine-tuning on conditional generation. We formalize prompt construction for controllable mixed-initiative dialogue. Our findings show improvements over fine-tuning and ground truth responses according to human evaluation and automatic metrics for two tasks: PersuasionForGood and Emotional Support Conversations.", + "authors": [ + "Maximillian Chen", + "Xiao Yu", + "Weiyan Shi", + "Urvi Awasthi", + "Zhou Yu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.82", + "point2d": [ + 17.22221565246582, + 64.64908599853516 + ], + "cluster": 24.0 + }, + { + "idx": 994, + "title": "Enhancing Event Causality Identification with Counterfactual Reasoning", + "abstract": "Existing methods for event causality identification (ECI) focus on mining potential causal signals, i.e., causal context keywords and event pairs. However, causal signals are ambiguous, which may lead to the context-keywords bias and the event-pairs bias. To solve this issue, we propose counterfactual reasoning, which explicitly estimates the influence of context keywords and event pairs in training, so that we are able to eliminate the biases in inference. Experiments are conducted on two datasets, and the results demonstrate the effectiveness of our method.", + "authors": [ + "Feiteng Mu", + "Wenjie Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.83", + "point2d": [ + 51.94532775878906, + -46.06578826904297 + ], + "cluster": 28.0 + }, + { + "idx": 995, + "title": "Contrastive Bootstrapping for Label Refinement", + "abstract": "Traditional text classification typically categorizes texts into pre-defined coarse-grained classes, so the produced models cannot handle the real-world scenario where finer categories emerge periodically for accurate services. In this work, we investigate the setting where fine-grained classification is done only using the annotation of coarse-grained categories and the coarse-to-fine mapping. We propose a lightweight contrastive clustering-based bootstrapping method to iteratively refine the labels of passages. During clustering, it pulls away negative passage-prototype pairs under the guidance of the mapping from both global and local perspectives. 
Experiments on NYT and 20News show that our method outperforms the state-of-the-art methods by a large margin.", + "authors": [ + "Shudi Hou", + "Yu Xia", + "Muhao Chen", + "Sujian Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.84", + "point2d": [ + -0.42305564880371094, + -23.62589454650879 + ], + "cluster": 17.0 + }, + { + "idx": 996, + "title": "NollySenti: Leveraging Transfer Learning and Machine Translation for Nigerian Movie Sentiment Classification", + "abstract": "Africa has over 2000 indigenous languages but they are under-represented in NLP research due to a lack of datasets. In recent years, there has been progress in developing labelled corpora for African languages. However, they are often available in a single domain and may not generalize to other domains. In this paper, we focus on the task of sentiment classification for cross-domain adaptation. We create a new dataset of Nollywood movie reviews for five languages widely spoken in Nigeria (English, Hausa, Igbo, Nigerian Pidgin, and Yoruba). We provide an extensive empirical evaluation using classical machine learning methods and pre-trained language models. By leveraging transfer learning, we compare the performance of cross-domain adaptation from the Twitter domain, and cross-lingual adaptation from English. Our evaluation shows that transfer from English in the same target domain leads to more than 5% improvement in accuracy compared to transfer from Twitter in the same language. To further mitigate the domain difference, we leverage machine translation from English to other Nigerian languages, which leads to a further improvement of 7% over cross-lingual evaluation. While machine translation to low-resource languages is often of low quality, our analysis shows that sentiment-related words are often preserved.", + "authors": [ + "Iyanuoluwa Shode", + "David Ifeoluwa Adelani", + "JIng Peng", + "Anna Feldman" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.85", + "point2d": [ + 13.358028411865234, + -31.212207794189453 + ], + "cluster": 1.0 + }, + { + "idx": 997, + "title": "Trading Syntax Trees for Wordpieces: Target-oriented Opinion Words Extraction with Wordpieces and Aspect Enhancement", + "abstract": "State-of-the-art target-oriented opinion word extraction (TOWE) models typically use BERT-based text encoders that operate on the word level, along with graph convolutional networks (GCNs) that incorporate syntactic information extracted from syntax trees. These methods achieve limited gains with GCNs and have difficulty using BERT wordpieces. Meanwhile, BERT wordpieces are known to be effective at representing rare words or words with insufficient context information. To address this issue, this work trades syntax trees for BERT wordpieces by entirely removing the GCN component from the methods\u2019 architectures. To enhance TOWE performance, we tackle the issue of aspect representation loss during encoding. Instead of solely utilizing a sentence as the input, we use a sentence-aspect pair. 
Our relatively simple approach achieves state-of-the-art results on benchmark datasets and should serve as a strong baseline for further research.", + "authors": [ + "Samuel Mensah", + "Kai Sun", + "Nikolaos Aletras" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.86", + "point2d": [ + 16.820449829101562, + -36.26769256591797 + ], + "cluster": 13.0 + }, + { + "idx": 998, + "title": "An (unhelpful) guide to selecting the best ASR architecture for your under-resourced language", + "abstract": "Advances in deep neural models for automatic speech recognition (ASR) have yielded dramatic improvements in ASR quality for resource-rich languages, with English ASR now achieving word error rates comparable to those of human transcribers. The vast majority of the world\u2019s languages, however, lack the quantity of data necessary to approach this level of accuracy. In this paper we use four of the most popular ASR toolkits to train ASR models for eleven languages with limited ASR training resources: eleven widely spoken languages of Africa, Asia, and South America, one endangered language of Central America, and three critically endangered languages of North America. We find that no single architecture consistently outperforms any other. These differences in performance so far do not appear to be related to any particular feature of the datasets or characteristics of the languages. These findings have important implications for future research in ASR for under-resourced languages. ASR systems for languages with abundant existing media and available speakers may derive the most benefit simply by collecting large amounts of additional acoustic and textual training data. Communities using ASR to support endangered language documentation efforts, who cannot easily collect more data, might instead focus on exploring multiple architectures and hyperparameterizations to optimize performance within the constraints of their available data and resources.", + "authors": [ + "Robert Jimerson", + "Zoey Liu", + "Emily Prud\u2019hommeaux" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.87", + "point2d": [ + -70.48838806152344, + 13.782743453979492 + ], + "cluster": 46.0 + }, + { + "idx": 999, + "title": "The Ecological Fallacy in Annotation: Modeling Human Label Variation goes beyond Sociodemographics", + "abstract": "Many NLP tasks exhibit human label variation, where different annotators give different labels to the same texts. This variation is known to depend, at least in part, on the sociodemographics of annotators. Recent research aims to model individual annotator behaviour rather than predicting aggregated labels, and we would expect that sociodemographic information is useful for these models. On the other hand, the ecological fallacy states that aggregate group behaviour, such as the behaviour of the average female annotator, does not necessarily explain individual behaviour. To account for sociodemographics in models of individual annotator behaviour, we introduce group-specific layers to multi-annotator models. In a series of experiments for toxic content detection, we find that explicitly accounting for sociodemographic attributes in this way does not significantly improve model performance.
This result shows that individual annotation behaviour depends on much more than just sociodemographics.", + "authors": [ + "Matthias Orlikowski", + "Paul R\u00f6ttger", + "Philipp Cimiano", + "Dirk Hovy" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.88", + "point2d": [ + 23.173444747924805, + 26.14282989501953 + ], + "cluster": 10.0 + }, + { + "idx": 1000, + "title": "Decomposed scoring of CCG dependencies", + "abstract": "In statistical parsing with CCG, the standard evaluation method is based on predicate-argument structure and evaluates dependencies labelled in part by lexical categories. When a predicate has multiple argument slots that can be filled, the same lexical category is used for the label of multiple dependencies. In this paper, we show that this evaluation can result in disproportionate penalization of supertagging errors and obfuscate the truly erroneous dependencies. Enabled by the compositional nature of CCG lexical categories, we propose *decomposed scoring* based on subcategorial labels to address this. To evaluate our scoring method, we engage fellow categorial grammar researchers in two English-language judgement tasks: (1) directly ranking the outputs of the standard and experimental scoring methods; and (2) determining which of two sentences has the better parse in cases where the two scoring methods disagree on their ranks. Overall, the judges prefer decomposed scoring in each task; but there is substantial disagreement among the judges in 24% of the given cases, pointing to potential issues with parser evaluations in general.", + "authors": [ + "Aditya Bhargava", + "Gerald Penn" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.89", + "point2d": [ + -24.290727615356445, + -64.7170639038086 + ], + "cluster": 41.0 + }, + { + "idx": 1001, + "title": "Do GPTs Produce Less Literal Translations?", + "abstract": "Large Language Models (LLMs) such as GPT-3 have emerged as general-purpose language models capable of addressing many natural language generation or understanding tasks. On the task of Machine Translation (MT), multiple works have investigated few-shot prompting mechanisms to elicit better translations from LLMs. However, there has been relatively little investigation into how such translations differ qualitatively from the translations generated by standard Neural Machine Translation (NMT) models. In this work, we investigate these differences in terms of the literalness of translations produced by the two systems. Using literalness measures involving word alignment and monotonicity, we find that translations out of English (E-X) from GPTs tend to be less literal, while exhibiting similar or better scores on MT quality metrics. We demonstrate that this finding is borne out in human evaluations as well. We then show that these differences are especially pronounced when translating sentences that contain idiomatic expressions.", + "authors": [ + "Vikas Raunak", + "Arul Menezes", + "Matt Post", + "Hany Hassan" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.90", + "point2d": [ + -62.299156188964844, + -2.6101903915405273 + ], + "cluster": 1.0 + }, + { + "idx": 1002, + "title": "Environmental Claim Detection", + "abstract": "To transition to a green economy, environmental claims made by companies must be reliable, comparable, and verifiable.
To analyze such claims at scale, automated methods are needed to detect them in the first place. However, there exist no datasets or models for this. Thus, this paper introduces the task of environmental claim detection. To accompany the task, we release an expert-annotated dataset and models trained on this dataset. We preview one potential application of such models: We detect environmental claims made in quarterly earnings calls and find that the number of environmental claims has steadily increased since the Paris Agreement in 2015.", + "authors": [ + "Dominik Stammbach", + "Nicolas Webersinke", + "Julia Bingler", + "Mathias Kraus", + "Markus Leippold" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.91", + "point2d": [ + 31.2145938873291, + 8.582038879394531 + ], + "cluster": 19.0 + }, + { + "idx": 1003, + "title": "Black-box language model explanation by context length probing", + "abstract": "The increasingly widespread adoption of large language models has highlighted the need for improving their explainability. We present *context length probing*, a novel explanation technique for causal language models, based on tracking the predictions of a model as a function of the length of available context, and allowing *differential importance scores* to be assigned to different contexts. The technique is model-agnostic and does not rely on access to model internals beyond computing token-level probabilities. We apply context length probing to large pre-trained language models and offer some initial analyses and insights, including the potential for studying long-range dependencies. The [source code](https://github.com/cifkao/context-probing/) and an [interactive demo](https://cifkao.github.io/context-probing/) of the method are available.", + "authors": [ + "Ond\u0159ej C\u00edfka", + "Antoine Liutkus" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.92", + "point2d": [ + -47.488216400146484, + -32.96102523803711 + ], + "cluster": 27.0 + }, + { + "idx": 1004, + "title": "Let Me Check the Examples: Enhancing Demonstration Learning via Explicit Imitation", + "abstract": "Demonstration learning aims to guide the prompt prediction by providing answered demonstrations in the few-shot setting. Despite achieving promising results, existing work only concatenates the answered examples as demonstrations to the prompt template (including the raw context) without any additional operation, neglecting the prompt-demonstration dependencies. Besides, prior research found that randomly replacing the labels of demonstrations marginally hurts performance, illustrating that the model could not properly learn the knowledge brought by the demonstrations. Inspired by the human learning process, in this paper, we introduce Imitation DEMOnstration learning (Imitation-Demo) to strengthen demonstration learning via explicitly imitating human review behaviour, which includes: (1) a contrastive learning mechanism to concentrate on similar demonstrations; and (2) a demonstration-label re-prediction method to consolidate known knowledge. Experiment results show that our proposed method achieves state-of-the-art performance on 5 out of 14 classification corpora.
Further studies also prove that Imitation-Demo strengthens the associations between the prompt and demonstrations, which could provide the basis for exploring how demonstration learning works.", + "authors": [ + "Sirui Wang", + "Kaiwen Wei", + "Hongzhi Zhang", + "Yuntao Li", + "Wei Wu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.93", + "point2d": [ + -10.757761001586914, + -17.305208206176758 + ], + "cluster": 3.0 + }, + { + "idx": 1005, + "title": "The Inside Story: Towards Better Understanding of Machine Translation Neural Evaluation Metrics", + "abstract": "Neural metrics for machine translation evaluation, such as COMET, exhibit significant improvements in their correlation with human judgments, as compared to traditional metrics based on lexical overlap, such as BLEU. Yet, neural metrics are, to a great extent, \u201cblack boxes\u201d returning a single sentence-level score without transparency about the decision-making process. In this work, we develop and compare several neural explainability methods and demonstrate their effectiveness for interpreting state-of-the-art fine-tuned neural metrics. Our study reveals that these metrics leverage token-level information that can be directly attributed to translation errors, as assessed through comparison of token-level neural saliency maps with Multidimensional Quality Metrics (MQM) annotations and with synthetically-generated critical translation errors. To ease future research, we release our code at: https://github.com/Unbabel/COMET/tree/explainable-metrics", + "authors": [ + "Ricardo Rei", + "Nuno M. Guerreiro", + "Marcos Treviso", + "Luisa Coheur", + "Alon Lavie", + "Andr\u00e9 Martins" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.94", + "point2d": [ + -74.59010314941406, + -4.022735595703125 + ], + "cluster": 1.0 + }, + { + "idx": 1006, + "title": "Typo-Robust Representation Learning for Dense Retrieval", + "abstract": "Dense retrieval is a basic building block of information retrieval applications. One of the main challenges of dense retrieval in real-world settings is the handling of queries containing misspelled words. A popular approach for handling misspelled queries is minimizing the representation discrepancy between misspelled queries and their pristine ones. Unlike the existing approaches, which only focus on the alignment between misspelled and pristine queries, our method also improves the contrast between each misspelled query and its surrounding queries. To assess the effectiveness of our proposed method, we compare it against the existing competitors using two benchmark datasets and two base encoders. Our method outperforms the competitors in all cases with misspelled queries.
Our code and models are available at https://github.com/panuthept/DST-DenseRetrieval.", + "authors": [ + "Panuthep Tasawong", + "Wuttikorn Ponwitayarat", + "Peerat Limkonchotiwat", + "Can Udomcharoenchaikit", + "Ekapol Chuangsuwanich", + "Sarana Nutanong" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.95", + "point2d": [ + 13.78093147277832, + -13.127473831176758 + ], + "cluster": 18.0 + }, + { + "idx": 1007, + "title": "Focused Prefix Tuning for Controllable Text Generation", + "abstract": "In a controllable text generation dataset, there exist unannotated attributes that could provide irrelevant learning signals to models that use the dataset for training and thus degrade their performance. We propose focused prefix tuning (FPT) to mitigate the problem and to enable the control to focus on the desired attribute. Experimental results show that FPT can achieve better control accuracy and text fluency than baseline models in single-attribute control tasks. In multi-attribute control tasks, FPT achieves comparable control accuracy with the state-of-the-art approach while keeping the flexibility to control new attributes without retraining existing models.", + "authors": [ + "Congda Ma", + "Tianyu Zhao", + "Makoto Shing", + "Kei Sawada", + "Manabu Okumura" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.96", + "point2d": [ + -23.584753036499023, + 6.590748310089111 + ], + "cluster": 4.0 + }, + { + "idx": 1008, + "title": "ReAugKD: Retrieval-Augmented Knowledge Distillation For Pre-trained Language Models", + "abstract": "Knowledge Distillation (KD) is one of the most effective approaches to deploying large-scale pre-trained language models in low-latency environments by transferring the knowledge contained in the large-scale models to smaller student models. Prior KD approaches use the soft labels and intermediate activations generated by the teacher to transfer knowledge to the student model parameters alone. In this paper, we show that having access to non-parametric memory in the form of a knowledge base with the teacher\u2019s soft labels and predictions can further improve student generalization. To enable the student to retrieve from the knowledge base effectively, we propose a new framework and loss function that preserve the semantic similarities of teacher and student training examples.
We show through extensive experiments that our retrieval mechanism can achieve state-of-the-art performance for task-specific knowledge distillation on the GLUE benchmark.", + "authors": [ + "Jianyi Zhang", + "Aashiq Muhamed", + "Aditya Anantharaman", + "Guoyin Wang", + "Changyou Chen", + "Kai Zhong", + "Qingjun Cui", + "Yi Xu", + "Belinda Zeng", + "Trishul Chilimbi", + "Yiran Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.97", + "point2d": [ + -48.69846725463867, + -22.311805725097656 + ], + "cluster": 39.0 + }, + { + "idx": 1009, + "title": "Debiasing Generative Named Entity Recognition by Calibrating Sequence Likelihood", + "abstract": "Recognizing flat, overlapped and discontinuous entities uniformly has received increasing attention. Among these works, the Seq2Seq formulation prevails for its flexibility and effectiveness. It arranges the output entities into a specific target sequence. However, it introduces bias by assigning all the probability mass to the observed sequence. To alleviate the bias, previous works either augment the data with possible sequences or resort to other formulations. In this paper, we stick to the Seq2Seq formulation and propose a reranking-based approach. It redistributes the likelihood among candidate sequences depending on their performance via a contrastive loss. Extensive experiments show that our simple yet effective method consistently boosts the baseline, and yields competitive or better results compared with the state-of-the-art methods on 8 widely-used datasets for Named Entity Recognition.", + "authors": [ + "Yu Xia", + "Yongwei Zhao", + "Wenhao Wu", + "Sujian Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.98", + "point2d": [ + 34.77544021606445, + -85.27980041503906 + ], + "cluster": 14.0 + }, + { + "idx": 1010, + "title": "Deriving Language Models from Masked Language Models", + "abstract": "Masked language models (MLM) do not explicitly define a distribution over language, i.e., they are not language models per se. However, recent work has implicitly treated them as such for the purposes of generation and scoring. This paper studies methods for deriving explicit joint distributions from MLMs, focusing on distributions over two tokens, which makes it possible to calculate exact distributional properties. We find that an approach based on identifying joints whose conditionals are closest to those of the MLM works well and outperforms existing Markov random field-based approaches. We further find that this derived model\u2019s conditionals can even occasionally outperform the original MLM\u2019s conditionals.", + "authors": [ + "Lucas Torroba Hennigen", + "Yoon Kim" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.99", + "point2d": [ + -37.78921127319336, + -1.5585389137268066 + ], + "cluster": 4.0 + }, + { + "idx": 1011, + "title": "UniTRec: A Unified Text-to-Text Transformer and Joint Contrastive Learning Framework for Text-based Recommendation", + "abstract": "Prior work has shown that pretrained language models (PLM) can boost the performance of text-based recommendation.
In contrast to previous works that either use PLMs to encode user history as a whole input text or impose an additional aggregation network to fuse multi-turn history representations, we propose a unified local- and global-attention Transformer encoder to better model two-level contexts of user history. Moreover, conditioned on user history encoded by Transformer encoders, our framework leverages Transformer decoders to estimate the language perplexity of candidate text items, which can serve as a straightforward yet significant contrastive signal for user-item text matching. Based on this, our framework, UniTRec, unifies the contrastive objectives of discriminative matching scores and candidate text perplexity to jointly enhance text-based recommendation. Extensive evaluation shows that UniTRec delivers SOTA performance on three text-based recommendation tasks.", + "authors": [ + "Zhiming Mao", + "Huimin Wang", + "Yiming Du", + "Kam-Fai Wong" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.100", + "point2d": [ + 2.867446184158325, + 33.78429412841797 + ], + "cluster": 18.0 + }, + { + "idx": 1012, + "title": "Reasoning Implicit Sentiment with Chain-of-Thought Prompting", + "abstract": "While sentiment analysis systems try to determine the sentiment polarities of given targets based on the key opinion expressions in input texts, in implicit sentiment analysis (ISA) the opinion cues come in an implicit and obscure manner. Thus detecting implicit sentiment requires the common-sense and multi-hop reasoning ability to infer the latent intent of opinion. Inspired by the recent chain-of-thought (CoT) idea, in this work we introduce a Three-hop Reasoning (THOR) CoT framework to mimic the human-like reasoning process for ISA. We design a three-step prompting principle for THOR to step-by-step induce the implicit aspect, opinion, and finally the sentiment polarity. Our THOR+Flan-T5 (11B) pushes the state-of-the-art (SoTA) by over 6% F1 in the supervised setup. More strikingly, THOR+GPT3 (175B) boosts the SoTA by over 50% F1 in the zero-shot setting.", + "authors": [ + "Hao Fei", + "Bobo Li", + "Qian Liu", + "Lidong Bing", + "Fei Li", + "Tat-Seng Chua" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.101", + "point2d": [ + 18.666996002197266, + -33.4122428894043 + ], + "cluster": 13.0 + }, + { + "idx": 1013, + "title": "Latent Positional Information is in the Self-Attention Variance of Transformer Language Models Without Positional Embeddings", + "abstract": "The use of positional embeddings in transformer language models is widely accepted. However, recent research has called into question the necessity of such embeddings. We further extend this inquiry by demonstrating that a randomly initialized and frozen transformer language model, devoid of positional embeddings, inherently encodes strong positional information through the shrinkage of self-attention variance. To quantify this variance, we derive the underlying distribution of each step within a transformer layer. Through empirical validation using a fully pretrained model, we show that the variance shrinkage effect still persists after extensive gradient updates.
Our findings serve to justify the decision to discard positional embeddings and thus facilitate more efficient pretraining of transformer language models.", + "authors": [ + "Ta-Chung Chi", + "Ting-Han Fan", + "Li-Wei Chen", + "Alexander Rudnicky", + "Peter Ramadge" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.102", + "point2d": [ + -40.4284553527832, + -29.06642723083496 + ], + "cluster": 27.0 + }, + { + "idx": 1014, + "title": "Is Anisotropy Truly Harmful? A Case Study on Text Clustering", + "abstract": "In the last few years, several studies have been devoted to dissecting dense text representations in order to understand their effectiveness and further improve their quality. Particularly, the anisotropy of such representations has been observed, which means that the directions of the word vectors are not evenly distributed across the space but rather concentrated in a narrow cone. This has led to several attempts to counteract this phenomenon both on static and contextualized text representations. However, despite this effort, there is no established relationship between anisotropy and performance. In this paper, we aim to bridge this gap by investigating the impact of different transformations on both the isotropy and the performance in order to assess the true impact of anisotropy. To this end, we rely on the clustering task as a means of evaluating the ability of text representations to produce meaningful groups. Thereby, we empirically show a limited impact of anisotropy on the expressiveness of sentence representations both in terms of directions and L2 closeness.", + "authors": [ + "Mira Ait-Saada", + "Mohamed Nadif" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.103", + "point2d": [ + -2.0450565814971924, + -28.982397079467773 + ], + "cluster": 17.0 + }, + { + "idx": 1015, + "title": "Class based Influence Functions for Error Detection", + "abstract": "Influence functions (IFs) are a powerful tool for detecting anomalous examples in large-scale datasets. However, they are unstable when applied to deep networks. In this paper, we provide an explanation for the instability of IFs and develop a solution to this problem. We show that IFs are unreliable when the two data points belong to two different classes. Our solution leverages class information to improve the stability of IFs. Extensive experiments show that our modification significantly improves the performance and stability of IFs while incurring no additional computational cost.", + "authors": [ + "Thang Nguyen-Duc", + "Hoang Thanh-Tung", + "Quan Hung Tran", + "Dang Huu-Tien", + "Hieu Nguyen", + "Anh T. V. Dau", + "Nghi Bui" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.104", + "point2d": [ + 0.23974481225013733, + -0.3687570095062256 + ], + "cluster": 17.0 + }, + { + "idx": 1016, + "title": "Leveraging Prefix Transfer for Multi-Intent Text Revision", + "abstract": "Text revision is a necessary process to improve text quality.
During this process, writers constantly edit texts out of different edit intentions. Identifying the edit intention for a raw text is always ambiguous, and most previous work on revision systems mainly focuses on editing texts according to one specific edit intention. In this work, we aim to build a multi-intent text revision system that could revise texts without explicit intent annotation. Our system is based on prefix-tuning, which first obtains prefixes for every edit intent and then trains a prefix transfer module, enabling the system to selectively leverage the knowledge from various prefixes according to the input text. We conduct experiments on the IteraTeR dataset, and the results show that our system outperforms baselines. The system significantly improves the SARI score, with more than 3% improvement, thriving on the learned editing intention prefixes.", + "authors": [ + "Ruining Chong", + "Cunliang Kong", + "Liu Wu", + "Zhenghao Liu", + "Ziye Jin", + "Liner Yang", + "Yange Fan", + "Hanghang Fan", + "Erhong Yang" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.105", + "point2d": [ + -28.446334838867188, + 30.995187759399414 + ], + "cluster": 35.0 + }, + { + "idx": 1017, + "title": "Learning Multi-Step Reasoning by Solving Arithmetic Tasks", + "abstract": "Mathematical reasoning is regarded as a necessary ability for Language Models (LMs). Recent works demonstrate large LMs\u2019 impressive performance in solving math problems. The success is attributed to their Chain-of-Thought (CoT) reasoning abilities, i.e., the ability to decompose complex questions into step-by-step reasoning chains, but such ability seems only to emerge from models with abundant parameters. This work investigates how to equip relatively small LMs with the capabilities of multi-step reasoning. We propose to inject such abilities by continually pre-training LMs on a synthetic dataset MsAT which is composed of Multi-step Arithmetic Tasks. Our experiments on four math word problem datasets show the effectiveness of the proposed method in enhancing LMs\u2019 math reasoning abilities.", + "authors": [ + "Tianduo Wang", + "Wei Lu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.106", + "point2d": [ + 44.491485595703125, + -19.235309600830078 + ], + "cluster": 12.0 + }, + { + "idx": 1018, + "title": "Towards Adaptive Prefix Tuning for Parameter-Efficient Language Model Fine-tuning", + "abstract": "Fine-tuning large pre-trained language models on various downstream tasks with whole parameters is prohibitively expensive. Hence, parameter-efficient fine-tuning, which only optimizes a few task-specific parameters with the frozen pre-trained model, has attracted attention. In this work, we focus on prefix tuning, which only optimizes continuous prefix vectors (i.e., pseudo tokens) inserted into Transformer layers. Based on the observation that the learned syntax and semantics representation varies a lot at different layers, we argue that an adaptive prefix can be tailored to each layer better than a fixed one, making fine-tuning more effective and efficient. Thus, we propose Adaptive Prefix Tuning (APT) to adjust the prefix in terms of both fine-grained token level and coarse-grained layer level with a gate mechanism. Experiments on the SuperGLUE and NER datasets show the effectiveness of APT.
In addition, taking the gate as a probe, we validate the efficiency and effectiveness of the variable prefix.", + "authors": [ + "Zhen-Ru Zhang", + "Chuanqi Tan", + "Haiyang Xu", + "Chengyu Wang", + "Jun Huang", + "Songfang Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.107", + "point2d": [ + -32.87663269042969, + -22.43700408935547 + ], + "cluster": 8.0 + }, + { + "idx": 1019, + "title": "Improving Gender Fairness of Pre-Trained Language Models without Catastrophic Forgetting", + "abstract": "Existing studies addressing gender bias of pre-trained language models usually build a small gender-neutral data set and conduct a second-phase pre-training on the model with such data. However, given the limited size and concentrated focus of the gender-neutral data, catastrophic forgetting would occur during second-phase pre-training. Forgetting information in the original training data may damage the model\u2019s downstream performance by a large margin. In this work, we empirically show that catastrophic forgetting occurs in such methods by evaluating them with general NLP tasks in GLUE. Then, we propose a new method, GEnder Equality Prompt (GEEP), to improve gender fairness of pre-trained models with less forgetting. GEEP freezes the pre-trained model and learns gender-related prompts with gender-neutral data. Empirical results show that GEEP not only achieves SOTA performances on gender fairness tasks, but also forgets less and performs better on GLUE by a large margin.", + "authors": [ + "Zahra Fatemi", + "Chen Xing", + "Wenhao Liu", + "Caimming Xiong" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.108", + "point2d": [ + 12.26727294921875, + 29.445024490356445 + ], + "cluster": 10.0 + }, + { + "idx": 1020, + "title": "Class-Incremental Learning based on Label Generation", + "abstract": "Despite the great success of pre-trained language models, it is still a challenge to use these models for continual learning, especially for the class-incremental learning (CIL) setting due to catastrophic forgetting (CF). This paper reports our finding that if we formulate CIL as a continual label generation problem, CF is drastically reduced and the generalizable representations of pre-trained models can be better retained. We thus propose a new CIL method (VAG) that also leverages the sparsity of vocabulary to focus the generation and creates pseudo-replay samples by using label semantics.
Experimental results show that VAG outperforms baselines by a large margin.", + "authors": [ + "Yijia Shao", + "Yiduo Guo", + "Dongyan Zhao", + "Bing Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.109", + "point2d": [ + -12.612902641296387, + -14.641129493713379 + ], + "cluster": 39.0 + }, + { + "idx": 1021, + "title": "Evaluating pragmatic abilities of image captioners on A3DS", + "abstract": "Evaluating grounded neural language model performance with respect to pragmatic qualities like the trade-off between truthfulness, contrastivity and over-informativity of generated utterances remains a challenge in the absence of data collected from humans. To enable such evaluation, we present a novel open source image-text dataset \u201cAnnotated 3D Shapes\u201d (A3DS) comprising over nine million exhaustive natural language annotations and over 12 million variable-granularity captions for the 480,000 images provided by Burgess & Kim (2018). We showcase the evaluation of pragmatic abilities developed by a task-neutral image captioner fine-tuned in a multi-agent communication setting to produce contrastive captions. The evaluation is enabled by the dataset because the exhaustive annotations make it possible to quantify the presence of contrastive features in the model\u2019s generations. We show that the model develops human-like patterns (informativity, brevity, over-informativity for specific features (e.g., shape, color biases)).", + "authors": [ + "Polina Tsvilodub", + "Michael Franke" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.110", + "point2d": [ + -60.424434661865234, + 47.275360107421875 + ], + "cluster": 43.0 + }, + { + "idx": 1022, + "title": "The Art of Prompting: Event Detection based on Type Specific Prompts", + "abstract": "We compare various forms of prompts to represent event types and develop a unified framework to incorporate the event-type-specific prompts for supervised, few-shot, and zero-shot event detection. The experimental results demonstrate that a well-defined and comprehensive event type prompt can significantly improve event detection performance, especially when the annotated data is scarce (few-shot event detection) or not available (zero-shot event detection). By leveraging the semantics of event types, our unified framework shows up to 22.2% F-score gain over the previous state-of-the-art baselines.", + "authors": [ + "Sijia Wang", + "Mo Yu", + "Lifu Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.111", + "point2d": [ + 52.382789611816406, + -37.51914596557617 + ], + "cluster": 28.0 + }, + { + "idx": 1023, + "title": "Exploring the Impact of Layer Normalization for Zero-shot Neural Machine Translation", + "abstract": "This paper studies the impact of layer normalization (LayerNorm) on zero-shot translation (ZST). Recent efforts for ZST often utilize the Transformer architecture as the backbone, with LayerNorm at the input of layers (PreNorm) set as the default. However, Xu et al. (2019) have revealed that PreNorm carries the risk of overfitting the training data. Based on this, we hypothesize that PreNorm may overfit supervised directions and thus have low generalizability for ZST.
Through experiments on the OPUS, IWSLT, and Europarl datasets for 54 ZST directions, we demonstrate that the original Transformer setting of LayerNorm after residual connections (PostNorm) consistently outperforms PreNorm by up to 12.3 BLEU points. We then study the performance disparities by analyzing the differences in off-target rates and structural variations between PreNorm and PostNorm. This study highlights the need for careful consideration of the LayerNorm setting for ZST.", + "authors": [ + "Zhuoyuan Mao", + "Raj Dabre", + "Qianying Liu", + "Haiyue Song", + "Chenhui Chu", + "Sadao Kurohashi" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.112", + "point2d": [ + -64.96031951904297, + -11.13650894165039 + ], + "cluster": 21.0 + }, + { + "idx": 1024, + "title": "Do Models Really Learn to Follow Instructions? An Empirical Study of Instruction Tuning", + "abstract": "Recent works on instruction tuning (IT) have achieved great performance with zero-shot generalizability to unseen tasks. With additional context (e.g., task definition, examples) provided to models for fine-tuning, they achieved much higher performance than untuned models. Despite impressive performance gains, what models learn from IT remains understudied. In this work, we analyze how models utilize instructions during IT by comparing model training with altered vs. original instructions. Specifically, we create simplified task definitions by removing all semantic components and only leaving the output space information, and delusive examples that contain incorrect input-output mapping. Our experiments show that models trained on simplified task definitions or delusive examples can achieve comparable performance to the ones trained on the original instructions and examples. Furthermore, we introduce a random baseline to perform zero-shot classification tasks, and find that it achieves performance (42.6% exact-match) similar to IT (43% exact-match) in the low-resource setting, while both methods outperform naive T5 significantly (30% per exact-match). Our analysis provides evidence that the impressive performance gain of current IT models can come from picking up superficial patterns, such as learning the output format and guessing. Our study highlights the urgent need for more reliable IT methods and evaluation.", + "authors": [ + "Po-Nien Kung", + "Nanyun Peng" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.113", + "point2d": [ + -20.34250831604004, + -19.224132537841797 + ], + "cluster": 3.0 + }, + { + "idx": 1025, + "title": "Self-Distilled Quantization: Achieving High Compression Rates in Transformer-Based Language Models", + "abstract": "We investigate the effects of post-training quantization and quantization-aware training on the generalization of Transformer language models. We present a new method called self-distilled quantization (SDQ) that minimizes accumulative quantization errors and outperforms baselines. We apply SDQ to multilingual models XLM-R_{\\text{Base}} and InfoXLM_{\\text{Base}} and demonstrate that both models can be reduced from 32-bit floating point weights to 8-bit integer weights while maintaining a high level of performance on the XGLUE benchmark.
Our results also highlight the challenges of quantizing multilingual models, which must generalize to languages they were not fine-tuned on.", + "authors": [ + "James O\u2019Neill", + "Sourav Dutta" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.114", + "point2d": [ + -51.52350997924805, + -12.731191635131836 + ], + "cluster": 8.0 + }, + { + "idx": 1026, + "title": "Modality Adaption or Regularization? A Case Study on End-to-End Speech Translation", + "abstract": "Pre-training and fine-tuning is a paradigm for alleviating the data scarcity problem in end-to-end speech translation (E2E ST). The commonplace \u201cmodality gap\u201d between speech and text data often leads to inconsistent inputs between pre-training and fine-tuning. However, we observe that this gap occurs in the early stages of fine-tuning, but does not have a major impact on the final performance. On the other hand, we find that there is another gap, which we call the \u201ccapacity gap\u201d: high-resource tasks (such as ASR and MT) always require a large model to fit; when the model is reused for a low-resource task (E2E ST), it will get sub-optimal performance due to over-fitting. In a case study, we find that regularization plays a more important role than the well-designed modality adaption method, which achieves 29.0 for en-de and 40.3 for en-fr on the MuST-C dataset.", + "authors": [ + "Yuchen Han", + "Chen Xu", + "Tong Xiao", + "Jingbo Zhu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.115", + "point2d": [ + -69.35824584960938, + 21.117773056030273 + ], + "cluster": 37.0 + }, + { + "idx": 1027, + "title": "Uncertainty-Aware Bootstrap Learning for Joint Extraction on Distantly-Supervised Data", + "abstract": "Jointly extracting entity pairs and their relations is challenging when working on distantly-supervised data with ambiguous or noisy labels. To mitigate such impact, we propose uncertainty-aware bootstrap learning, which is motivated by the intuition that the higher the uncertainty of an instance, the more likely the model confidence is inconsistent with the ground truths. Specifically, we first explore instance-level data uncertainty to create an initial set of high-confidence examples. Such a subset serves to filter noisy instances and helps the model converge fast at the early stage. During bootstrap learning, we propose self-ensembling as a regularizer to alleviate inter-model uncertainty produced by noisy labels. We further define the probability variance of joint tagging probabilities to estimate inner-model parametric uncertainty, which is used to select and build up new reliable training instances for the next iteration. Experimental results on two large datasets reveal that our approach outperforms existing strong baselines and related methods.", + "authors": [ + "Yufei Li", + "Xiao Yu", + "Yanchi Liu", + "Haifeng Chen", + "Cong Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.116", + "point2d": [ + 38.831363677978516, + -67.35757446289062 + ], + "cluster": 38.0 + }, + { + "idx": 1028, + "title": "Text-to-SQL Error Correction with Language Models of Code", + "abstract": "Despite recent progress in text-to-SQL parsing, current semantic parsers are still not accurate enough for practical use. In this paper, we investigate how to build automatic text-to-SQL error correction models.
Noticing that token-level edits are out of context and sometimes ambiguous, we propose building clause-level edit models instead. Besides, while most language models of code are not specifically pre-trained for SQL, they know common data structures and their operations in programming languages such as Python. Thus, we propose a novel representation for SQL queries and their edits that adheres more closely to the pre-training corpora of language models of code. Our error correction model improves the exact set match accuracy of different parsers by 2.4-6.5 points and obtains up to a 4.3-point absolute improvement over two strong baselines.", + "authors": [ + "Ziru Chen", + "Shijie Chen", + "Michael White", + "Raymond Mooney", + "Ali Payani", + "Jayanth Srinivasa", + "Yu Su", + "Huan Sun" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.117", + "point2d": [ + -10.672414779663086, + -54.84150314331055 + ], + "cluster": 11.0 + }, + { + "idx": 1029, + "title": "The Tail Wagging the Dog: Dataset Construction Biases of Social Bias Benchmarks", + "abstract": "How reliably can we trust the scores obtained from social bias benchmarks as faithful indicators of problematic social biases in a given model? In this work, we study this question by contrasting social biases with non-social biases that stem from choices made during dataset construction (which might not even be discernible to the human eye). To do so, we empirically simulate various alternative constructions for a given benchmark based on seemingly innocuous modifications (such as paraphrasing or random-sampling) that maintain the essence of their social bias. On two well-known social bias benchmarks (Winogender and BiasNLI), we observe that these shallow modifications have a surprising effect on the resulting degree of bias across various models and consequently the relative ordering of these models when ranked by measured bias. We hope these troubling observations motivate more robust measures of social biases.", + "authors": [ + "Nikil Selvam", + "Sunipa Dev", + "Daniel Khashabi", + "Tushar Khot", + "Kai-Wei Chang" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.118", + "point2d": [ + 21.736387252807617, + 30.15740394592285 + ], + "cluster": 10.0 + }, + { + "idx": 1030, + "title": "Summarizing, Simplifying, and Synthesizing Medical Evidence using GPT-3 (with Varying Success)", + "abstract": "Large language models, particularly GPT-3, are able to produce high-quality summaries of general domain news articles in few- and zero-shot settings. However, it is unclear if such models are similarly capable in more specialized domains such as biomedicine. In this paper we enlist domain experts (individuals with medical training) to evaluate summaries of biomedical articles generated by GPT-3, given no supervision. We consider both single- and multi-document settings. In the former, GPT-3 is tasked with generating regular and plain-language summaries of articles describing randomized controlled trials; in the latter, we assess the degree to which GPT-3 is able to synthesize evidence reported across a collection of articles. We design an annotation scheme for evaluating model outputs, with an emphasis on assessing the factual accuracy of generated summaries. We find that while GPT-3 is able to summarize and simplify single biomedical articles faithfully, it struggles to provide accurate aggregations of findings over multiple documents.
We release all data, code, and annotations used in this work.", + "authors": [ + "Chantal Shaib", + "Millicent Li", + "Sebastian Joseph", + "Iain Marshall", + "Junyi Jessy Li", + "Byron Wallace" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.119", + "point2d": [ + -5.630923271179199, + 32.9780387878418 + ], + "cluster": 40.0 + }, + { + "idx": 1031, + "title": "Prefix Propagation: Parameter-Efficient Tuning for Long Sequences", + "abstract": "Parameter-efficient tuning aims to mitigate the large memory requirements of adapting pretrained language models for downstream tasks. For example, one popular method, prefix-tuning, prepends trainable tokens to sequences while freezing the rest of the model\u2019s parameters. Although such models attain comparable performance with fine-tuning when applied to sequences with short to moderate lengths, we show their inferior performance when modelling long sequences. To bridge this gap, we propose prefix-propagation, a simple but effective approach that conditions prefixes on previous hidden states. We empirically demonstrate that prefix-propagation outperforms prefix-tuning across long-document tasks, while using 50% fewer parameters. To further investigate the proposed architecture, we also show its advantage in calibration, and perform an additional study on its relationship with kernel attention. To the best of our knowledge, this work is the first to focus on parameter-efficient learning for long-sequence language tasks.", + "authors": [ + "Jonathan Li", + "Will Aitken", + "Rohan Bhambhoria", + "Xiaodan Zhu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.120", + "point2d": [ + -34.151611328125, + -22.238067626953125 + ], + "cluster": 8.0 + }, + { + "idx": 1032, + "title": "Listener Model for the PhotoBook Referential Game with CLIPScores as Implicit Reference Chain", + "abstract": "PhotoBook is a collaborative dialogue game where two players receive private, partially-overlapping sets of images and resolve which images they have in common. It presents machines with a great challenge to learn how people build common ground around multimodal context to communicate effectively. Methods developed in the literature, however, cannot be deployed to real gameplay since they only tackle some subtasks of the game, and they require additional reference chain inputs, whose extraction process is imperfect. Therefore, we propose a reference chain-free listener model that directly addresses the game\u2019s predictive task, i.e., deciding whether an image is shared with the partner. Our DeBERTa-based listener model reads the full dialogue, and utilizes CLIPScore features to assess utterance-image relevance. We achieve >77% accuracy on unseen sets of images/game themes, outperforming the baseline by >17 points.", + "authors": [ + "Shih-Lun Wu", + "Yi-Hui Chou", + "Liangze Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.121", + "point2d": [ + 7.849738121032715, + 74.7378921508789 + ], + "cluster": 49.0 + }, + { + "idx": 1033, + "title": "Bring More Attention to Syntactic Symmetry for Automatic Postediting of High-Quality Machine Translations", + "abstract": "Automatic postediting (APE) is an automated process to refine a given machine translation (MT). Recent findings show that existing APE systems are not good at handling high-quality MTs even for a language pair with abundant data resources,
English\u2013German: the better the given MT is, the harder it is to decide what parts to edit and how to fix these errors. One possible solution to this problem is to instill deeper knowledge about the target language into the model. Thus, we propose a linguistically motivated method of regularization that is expected to enhance APE models\u2019 understanding of the target language: a loss function that encourages symmetric self-attention on the given MT. Our analysis of experimental results demonstrates that the proposed method helps improve the state-of-the-art architecture\u2019s APE quality for high-quality MTs.", + "authors": [ + "Baikjin Jung", + "Myungji Lee", + "Jong-Hyeok Lee", + "Yunsu Kim" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.122", + "point2d": [ + -68.8497314453125, + -9.148709297180176 + ], + "cluster": 1.0 + }, + { + "idx": 1034, + "title": "An Embarrassingly Easy but Strong Baseline for Nested Named Entity Recognition", + "abstract": "Named entity recognition (NER) is the task of detecting and classifying entity spans in text. When entity spans overlap with each other, the task is called nested NER. Span-based methods have been widely used to tackle nested NER. Most of these methods get a score matrix, where each entry corresponds to a span. However, previous work ignores spatial relations in the score matrix. In this paper, we propose using Convolutional Neural Network (CNN) to model these spatial relations. Despite being simple, experiments in three commonly used nested NER datasets show that our model surpasses several recently proposed methods with the same pre-trained encoders. Further analysis shows that using CNN can help the model find more nested entities. Besides, we find that different papers use different sentence tokenizations for the three nested NER datasets, which will influence the comparison. Thus, we release a pre-processing script to facilitate future comparison.", + "authors": [ + "Hang Yan", + "Yu Sun", + "Xiaonan Li", + "Xipeng Qiu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.123", + "point2d": [ + 30.520706176757812, + -80.94038391113281 + ], + "cluster": 14.0 + }, + { + "idx": 1035, + "title": "Hexatagging: Projective Dependency Parsing as Tagging", + "abstract": "We introduce a novel dependency parser, the hexatagger, that constructs dependency trees by tagging the words in a sentence with elements from a finite set of possible tags. In contrast to many approaches to dependency parsing, our approach is fully parallelizable at training time, i.e., the structure-building actions needed to build a dependency parse can be predicted in parallel to each other. Additionally, exact decoding is linear in time and space complexity. Furthermore, we derive a probabilistic dependency parser that predicts hexatags using no more than a linear model with features from a pretrained language model, i.e., we forsake a bespoke architecture explicitly designed for the task. Despite the generality and simplicity of our approach, we achieve state-of-the-art performance of 96.4 LAS and 97.4 UAS on the Penn Treebank test set.
Additionally, our parser\u2019s linear time complexity and parallelism significantly improve computational efficiency, with a roughly 10-times speed-up over previous state-of-the-art models during decoding.", + "authors": [ + "Afra Amini", + "Tianyu Liu", + "Ryan Cotterell" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.124", + "point2d": [ + -24.946475982666016, + -61.84856033325195 + ], + "cluster": 41.0 + }, + { + "idx": 1036, + "title": "Understanding Demonstration-based Learning from a Causal Perspective", + "abstract": "Demonstration-based learning has shown impressive performance in exploiting pretrained language models under few-shot learning settings. It is interesting to see that demonstrations, even those composed of random tokens, can still improve performance. In this paper, we build a Structural Causal Model (SCM) to understand demonstration-based learning from causal perspectives and interpret random demonstrations as interventions on the demonstration variable within the causal model. We investigate the causal effects and find that the concurrence of specific words in the demonstration will induce bias, while randomly sampled tokens in the demonstration do not. Based on this finding, we further propose simple ways to construct random demonstrations, which even outperform hand-crafted, meaningful demonstrations on public sequence labeling benchmarks.", + "authors": [ + "Ruiyi Zhang", + "Tong Yu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.125", + "point2d": [ + -10.54389762878418, + -17.59362030029297 + ], + "cluster": 3.0 + }, + { + "idx": 1037, + "title": "RAMP: Retrieval and Attribute-Marking Enhanced Prompting for Attribute-Controlled Translation", + "abstract": "Attribute-controlled translation (ACT) is a subtask of machine translation that involves controlling stylistic or linguistic attributes (like formality and gender) of translation outputs. While ACT has garnered attention in recent years due to its usefulness in real-world applications, progress in the task is currently limited by dataset availability, since most prior approaches rely on supervised methods. To address this limitation, we propose Retrieval and Attribute-Marking enhanced Prompting (RAMP), which leverages large multilingual language models to perform ACT in few-shot and zero-shot settings. RAMP improves generation accuracy over the standard prompting approach by (1) incorporating a semantic similarity retrieval component for selecting similar in-context examples, and (2) marking in-context examples with attribute annotations. Our comprehensive experiments show that RAMP is a viable approach in both zero-shot and few-shot settings.", + "authors": [ + "Gabriele Sarti", + "Phu Mon Htut", + "Xing Niu", + "Benjamin Hsu", + "Anna Currey", + "Georgiana Dinu", + "Maria Nadejde" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.126", + "point2d": [ + -61.139892578125, + -0.37480801343917847 + ], + "cluster": 1.0 + }, + { + "idx": 1038, + "title": "Zero-Shot and Few-Shot Stance Detection on Varied Topics via Conditional Generation", + "abstract": "Zero-shot and few-shot stance detection identify the polarity of text with regard to a certain target when we have only limited or no training resources for the target. 
Previous work generally formulates the problem as a classification task, ignoring the potential use of label text. In this paper, we instead utilize a conditional generation framework and formulate the problem as denoising from partially-filled templates, which can better utilize the semantics among input, label, and target texts. We further propose to jointly train an auxiliary task, target prediction, and to incorporate manually constructed incorrect samples with unlikelihood training to improve the representations for both target and label texts. We also verify the effectiveness of target-related Wikipedia knowledge with the generation framework. Experiments show that our proposed method significantly outperforms several strong baselines on VAST, and achieves new state-of-the-art performance.", + "authors": [ + "Haoyang Wen", + "Alexander Hauptmann" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.127", + "point2d": [ + 50.56585693359375, + 38.83431625366211 + ], + "cluster": 32.0 + }, + { + "idx": 1039, + "title": "Discourse-Level Representations can Improve Prediction of Degree of Anxiety", + "abstract": "Anxiety disorders are the most common of mental illnesses, but relatively little is known about how to detect them from language. The primary clinical manifestation of anxiety is worry-associated cognitive distortions, which are likely expressed at the discourse level of semantics. Here, we investigate the development of a modern linguistic assessment for degree of anxiety, specifically evaluating the utility of discourse-level information in addition to lexical-level large language model embeddings. We find that a combined lexico-discourse model outperforms models based solely on state-of-the-art contextual embeddings (RoBERTa), with discourse-level representations derived from Sentence-BERT and DiscRE both providing additional predictive power not captured by lexical-level representations. Interpreting the model, we find that discourse patterns of causal explanations, among others, were used significantly more by those scoring high in anxiety, dovetailing with psychological literature.", + "authors": [ + "Swanie Juhng", + "Matthew Matero", + "Vasudha Varadarajan", + "Johannes Eichstaedt", + "Adithya V Ganesan", + "H. Andrew Schwartz" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.128", + "point2d": [ + 35.05341339111328, + 72.6593017578125 + ], + "cluster": 42.0 + }, + { + "idx": 1040, + "title": "Controlling the Extraction of Memorized Data from Large Language Models via Prompt-Tuning", + "abstract": "Large Language Models (LLMs) are known to memorize significant portions of their training data. Parts of this memorized content have been shown to be extractable by simply querying the model, which poses a privacy risk. We present a novel approach which uses prompt-tuning to control the extraction rates of memorized content in LLMs. We present two prompt training strategies to increase and decrease extraction rates, which correspond to an attack and a defense, respectively. We demonstrate the effectiveness of our techniques by using models from the GPT-Neo family on a public benchmark. For the 1.3B parameter GPT-Neo model, our attack yields a 9.3 percentage point increase in extraction rate compared to our baseline. Our defense can be tuned to achieve different privacy-utility trade-offs by a user-specified hyperparameter.
We achieve an extraction rate reduction of up to 97.7% relative to our baseline, with a perplexity increase of 16.9%.", + "authors": [ + "Mustafa Ozdayi", + "Charith Peris", + "Jack FitzGerald", + "Christophe Dupuy", + "Jimit Majmudar", + "Haidar Khan", + "Rahil Parikh", + "Rahul Gupta" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.129", + "point2d": [ + -2.332594156265259, + 12.618293762207031 + ], + "cluster": 48.0 + }, + { + "idx": 1041, + "title": "MultiTool-CoT: GPT-3 Can Use Multiple External Tools with Chain of Thought Prompting", + "abstract": "Large language models (LLMs) have achieved impressive performance on various reasoning tasks. To further improve the performance, we propose MultiTool-CoT, a novel framework that leverages chain-of-thought (CoT) prompting to incorporate multiple external tools, such as a calculator and a knowledge retriever, during the reasoning process. We apply MultiTool-CoT to the Task 2 dataset of NumGLUE, which requires both numerical reasoning and domain-specific knowledge. The experiments show that our method significantly outperforms strong baselines and achieves state-of-the-art performance.", + "authors": [ + "Tatsuro Inaba", + "Hirokazu Kiyomaru", + "Fei Cheng", + "Sadao Kurohashi" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.130", + "point2d": [ + 45.358646392822266, + -14.485353469848633 + ], + "cluster": 36.0 + }, + { + "idx": 1042, + "title": "mPMR: A Multilingual Pre-trained Machine Reader at Scale", + "abstract": "We present multilingual Pre-trained Machine Reader (mPMR), a novel method for multilingual machine reading comprehension (MRC)-style pre-training. mPMR aims to guide multilingual pre-trained language models (mPLMs) to perform natural language understanding (NLU) including both sequence classification and span extraction in multiple languages. To achieve cross-lingual generalization when only source-language fine-tuning data is available, existing mPLMs solely transfer NLU capability from a source language to target languages. In contrast, mPMR allows the direct inheritance of multilingual NLU capability from the MRC-style pre-training to downstream tasks. Therefore, mPMR acquires better NLU capability for target languages. mPMR also provides a unified solver for tackling cross-lingual span extraction and sequence classification, thereby enabling the extraction of rationales to explain the sentence-pair classification process.", + "authors": [ + "Weiwen Xu", + "Xin Li", + "Wai Lam", + "Lidong Bing" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.131", + "point2d": [ + -24.572357177734375, + -34.9943733215332 + ], + "cluster": 46.0 + }, + { + "idx": 1043, + "title": "MOSPC: MOS Prediction Based on Pairwise Comparison", + "abstract": "As a subjective metric to evaluate the quality of synthesized speech, Mean opinion score (MOS) usually requires multiple annotators to score the same speech. Such an annotation approach requires a lot of manpower and is also time-consuming. A MOS prediction model for automatic evaluation can significantly reduce labor costs. In previous works, it is difficult to accurately rank the quality of speech when the MOS scores are close. However, in practical applications, it is more important to correctly rank the quality of synthesis systems or sentences than simply predicting MOS scores. 
Meanwhile, as each annotator scores multiple audio clips during annotation, the score is probably a relative value based on the first or the first few speech scores given by the annotator. Motivated by the above two points, we propose a general framework for MOS prediction based on pairwise comparison (MOSPC), and we utilize the C-Mixup algorithm to enhance the generalization performance of MOSPC. The experiments on BVCC and VCC2018 show that our framework outperforms the baselines on most of the correlation coefficient metrics, especially on the metric KTAU related to quality ranking. Our framework also surpasses the strong baseline in ranking accuracy on each fine-grained segment. These results indicate that our framework contributes to improving the ranking accuracy of speech quality.", + "authors": [ + "Kexin Wang", + "Yunlong Zhao", + "Qianqian Dong", + "Tom Ko", + "Mingxuan Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.132", + "point2d": [ + -72.75785827636719, + 20.616291046142578 + ], + "cluster": 29.0 + }, + { + "idx": 1044, + "title": "LI-RAGE: Late Interaction Retrieval Augmented Generation with Explicit Signals for Open-Domain Table Question Answering", + "abstract": "Recent open-domain TableQA models are typically implemented as retriever-reader pipelines. The retriever component is usually a variant of the Dense Passage Retriever, which computes the similarities between questions and tables based on a single representation of each. These fixed vectors can be insufficient to capture fine-grained features of potentially very big tables with heterogeneous row/column information. We address this limitation by 1) applying late interaction models which enforce a finer-grained interaction between question and table embeddings at retrieval time. In addition, we 2) incorporate a joint training scheme of the retriever and reader with explicit table-level signals, and 3) embed a binary relevance token as a prefix to the answer generated by the reader, so we can determine at inference time whether the table used to answer the question is reliable and filter accordingly. The combined strategies set a new state-of-the-art performance on two public open-domain TableQA datasets.", + "authors": [ + "Weizhe Lin", + "Rexhina Blloshmi", + "Bill Byrne", + "Adria de Gispert", + "Gonzalo Iglesias" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.133", + "point2d": [ + 77.10781860351562, + 7.719989776611328 + ], + "cluster": 5.0 + }, + { + "idx": 1045, + "title": "How Well Apply Simple MLP to Incomplete Utterance Rewriting?", + "abstract": "Incomplete utterance rewriting (IUR) aims to restore the incomplete utterance with sufficient context information for comprehension. This paper introduces a simple yet efficient IUR method. Different from prior studies, we first employ only a one-layer MLP architecture to mine latent semantic information between joint utterances for the IUR task (MIUR). After that, we construct a joint feature matrix to predict the token type and thus restore the incomplete utterance. 
The well-designed network and simple architecture make our method significantly superior to existing methods in terms of quality and inference speed. Our code is available at https://github.com/IMU-MachineLearningSXD/MIUR.", + "authors": [ + "Jiang Li", + "Xiangdong Su", + "Xinlan Ma", + "Guanglai Gao" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.134", + "point2d": [ + -5.089517116546631, + 62.153594970703125 + ], + "cluster": 30.0 + }, + { + "idx": 1046, + "title": "XL-LEXEME: WiC Pretrained Model for Cross-Lingual LEXical sEMantic changE", + "abstract": "The recent introduction of large-scale datasets for the WiC (Word in Context) task enables the creation of more reliable and meaningful contextualized word embeddings. However, most of the approaches to the WiC task use cross-encoders, which prevent the possibility of deriving comparable word embeddings. In this work, we introduce XL-LEXEME, a Lexical Semantic Change Detection model. XL-LEXEME extends SBERT, highlighting the target word in the sentence. We evaluate XL-LEXEME on the multilingual benchmarks for SemEval-2020 Task 1 - Lexical Semantic Change (LSC) Detection and the RuShiftEval shared task involving five languages: English, German, Swedish, Latin, and Russian. XL-LEXEME outperforms the state-of-the-art in English, German and Swedish with statistically significant differences from the baseline results and obtains state-of-the-art performance in the RuShiftEval shared task.", + "authors": [ + "Pierluigi Cassotti", + "Lucia Siciliani", + "Marco DeGemmis", + "Giovanni Semeraro", + "Pierpaolo Basile" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.135", + "point2d": [ + 3.627147674560547, + -44.36414337158203 + ], + "cluster": 9.0 + }, + { + "idx": 1047, + "title": "Theory-Grounded Computational Text Analysis", + "abstract": "In this position paper, we argue that computational text analysis lacks and requires organizing principles. A broad space separates its two constituent disciplines\u2014natural language processing and social science\u2014which has to date been sidestepped rather than filled by applying increasingly complex computational models to problems in social science research. We contrast descriptive and integrative findings, and our review of approximately 60 papers on computational text analysis reveals that those from *ACL venues are typically descriptive. The lack of theory began at the area\u2019s inception and has, over the decades, grown more important and challenging. A return to theoretically grounded research questions will propel the area from both theoretical and methodological points of view.", + "authors": [ + "Arya D. McCarthy", + "Giovanna Maria Dora Dore" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.136", + "point2d": [ + 34.28506088256836, + 28.928512573242188 + ], + "cluster": 19.0 + }, + { + "idx": 1048, + "title": "AMRs Assemble! Learning to Ensemble with Autoregressive Models for AMR Parsing", + "abstract": "In this paper, we examine the current state-of-the-art in AMR parsing, which relies on ensemble strategies by merging multiple graph predictions. Our analysis reveals that the present models often violate AMR structural constraints. To address this issue, we develop a validation method, and show how ensemble models can exploit SMATCH metric weaknesses to obtain higher scores, but sometimes result in corrupted graphs. 
Additionally, we highlight the pressing need to compute the SMATCH score among all possible predictions. To overcome these challenges, we propose two novel ensemble strategies based on Transformer models, improving robustness to structural constraints, while also reducing the computational time. Our methods provide new insights for enhancing AMR parsers and metrics. Our code is available at [https://www.github.com/babelscape/AMRs-Assemble](https://www.github.com/babelscape/AMRs-Assemble).", + "authors": [ + "Abelardo Carlos Mart\u00ednez Lorenzo", + "Pere Llu\u00eds Huguet Cabot", + "Roberto Navigli" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.137", + "point2d": [ + -25.858163833618164, + -63.23247528076172 + ], + "cluster": 41.0 + }, + { + "idx": 1049, + "title": "MolXPT: Wrapping Molecules with Text for Generative Pre-training", + "abstract": "Generative pre-trained Transformer (GPT) has demonstrated great success in natural language processing, and related techniques have been adapted into molecular modeling. Considering that text is the most important record for scientific discovery, in this paper, we propose MolXPT, a unified language model of text and molecules pre-trained on SMILES (a sequence representation of molecules) wrapped by text. Briefly, we detect the molecule names in each sequence and replace them with the corresponding SMILES. In this way, the SMILES could leverage the information from surrounding text, and vice versa. The above wrapped sequences, text sequences from PubMed and SMILES sequences from PubChem are all fed into a language model for pre-training. Experimental results demonstrate that MolXPT outperforms strong baselines of molecular property prediction on MoleculeNet, performs comparably to the best model in text-molecule translation while using less than half of its parameters, and enables zero-shot molecular generation without finetuning.", + "authors": [ + "Zequn Liu", + "Wei Zhang", + "Yingce Xia", + "Lijun Wu", + "Shufang Xie", + "Tao Qin", + "Ming Zhang", + "Tie-Yan Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.138", + "point2d": [ + 26.686220169067383, + -24.492935180664062 + ], + "cluster": 6.0 + }, + { + "idx": 1050, + "title": "A Study on the Efficiency and Generalization of Light Hybrid Retrievers", + "abstract": "Hybrid retrievers can take advantage of both sparse and dense retrievers. Previous hybrid retrievers leverage indexing-heavy dense retrievers. In this work, we study \u201cIs it possible to reduce the indexing memory of hybrid retrievers without sacrificing performance?\u201d Driven by this question, we leverage an indexing-efficient dense retriever (i.e. DrBoost) and introduce a LITE retriever that further reduces the memory of DrBoost. LITE is jointly trained with contrastive learning and knowledge distillation from DrBoost. Then, we integrate BM25, a sparse retriever, with either LITE or DrBoost to form light hybrid retrievers. Our Hybrid-LITE retriever saves 13\times the memory while maintaining 98.0% performance of the hybrid retriever of BM25 and DPR. In addition, we study the generalization capacity of our light hybrid retrievers on an out-of-domain dataset and a set of adversarial attack datasets. Experiments showcase that light hybrid retrievers achieve better generalization performance than individual sparse and dense retrievers. 
Nevertheless, our analysis shows that there is still large room to improve the robustness of retrievers, suggesting a new research direction.", + "authors": [ + "Man Luo", + "Shashank Jain", + "Anchit Gupta", + "Arash Einolghozati", + "Barlas Oguz", + "Debojeet Chatterjee", + "Xilun Chen", + "Chitta Baral", + "Peyman Heidari" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.139", + "point2d": [ + 15.578091621398926, + -10.69930648803711 + ], + "cluster": 48.0 + }, + { + "idx": 1051, + "title": "The Mechanical Bard: An Interpretable Machine Learning Approach to Shakespearean Sonnet Generation", + "abstract": "We consider the automated generation of sonnets, a poetic form constrained according to meter, rhyme scheme, and length. Sonnets generally also use rhetorical figures, expressive language, and a consistent theme or narrative. Our constrained decoding approach allows for the generation of sonnets within preset poetic constraints, while using a relatively modest neural backbone. Human evaluation confirms that our approach produces Shakespearean sonnets that resemble human-authored sonnets, and which adhere to the genre\u2019s defined constraints and contain lyrical language and literary devices.", + "authors": [ + "Edwin Agnew", + "Michelle Qiu", + "Lily Zhu", + "Sam Wiseman", + "Cynthia Rudin" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.140", + "point2d": [ + -31.26689910888672, + 39.04438781738281 + ], + "cluster": 35.0 + }, + { + "idx": 1052, + "title": "When to Use Efficient Self Attention? Profiling Text, Speech and Image Transformer Variants", + "abstract": "We present the first unified study of the efficiency of self-attention-based Transformer variants spanning text, speech and vision. We identify input length thresholds (tipping points) at which efficient Transformer variants become more efficient than vanilla models, using a variety of efficiency metrics (latency, throughput, and memory). To conduct this analysis for speech, we introduce L-HuBERT, a novel local-attention variant of a self-supervised speech model. We observe that these thresholds are (a) much higher than typical dataset sequence lengths and (b) dependent on the metric and modality, showing that choosing the right model depends on modality, task type (long-form vs. typical context) and resource constraints (time vs. memory). By visualising the breakdown of the computational costs for transformer components, we also show that non-self-attention components exhibit significant computational costs. We release our profiling toolkit at https://github.com/ajd12342/profiling-transformers .", + "authors": [ + "Anuj Diwan", + "Eunsol Choi", + "David Harwath" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.141", + "point2d": [ + -59.41233444213867, + 27.762475967407227 + ], + "cluster": 16.0 + }, + { + "idx": 1053, + "title": "Evaluating Zero-Shot Event Structures: Recommendations for Automatic Content Extraction (ACE) Annotations", + "abstract": "Zero-shot event extraction (EE) methods infer richly structured event records from text, based only on a minimal user specification and no training examples, which enables flexibility in exploring and developing applications. 
Most event extraction research uses the Automatic Content Extraction (ACE) annotated dataset to evaluate supervised EE methods, but can it be used to evaluate zero-shot and other low-supervision EE? We describe ACE\u2019s event structures and identify significant ambiguities and issues in current evaluation practice, including (1) coreferent argument mentions, (2) conflicting argument head conventions, and (3) ignorance of modality and event class details. By sometimes mishandling these subtleties, current work may dramatically understate the actual performance of zero-shot and other low-supervision EE, considering up to 32% of correctly identified arguments and 25% of correctly ignored event mentions as false negatives. For each issue, we propose recommendations for future evaluations so the research community can better utilize ACE as an event evaluation resource.", + "authors": [ + "Erica Cai", + "Brendan O\u2019Connor" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.142", + "point2d": [ + 45.37607955932617, + -43.89741516113281 + ], + "cluster": 28.0 + }, + { + "idx": 1054, + "title": "Event Extraction as Question Generation and Answering", + "abstract": "Recent work on Event Extraction has reframed the task as Question Answering (QA), with promising results. The advantage of this approach is that it addresses the error propagation issue found in traditional token-based classification approaches by directly predicting event arguments without extracting candidates first. However, the questions are typically based on fixed templates and they rarely leverage contextual information such as relevant arguments. In addition, prior QA-based approaches have difficulty handling cases where there are multiple arguments for the same role. In this paper, we propose QGA-EE, which enables a Question Generation (QG) model to generate questions that incorporate rich contextual information instead of using fixed templates. We also propose dynamic templates to assist the training of the QG model. Experiments show that QGA-EE outperforms all prior single-task-based models on the ACE05 English dataset.", + "authors": [ + "Di Lu", + "Shihao Ran", + "Joel Tetreault", + "Alejandro Jaimes" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.143", + "point2d": [ + 41.552978515625, + -44.53289031982422 + ], + "cluster": 28.0 + }, + { + "idx": 1055, + "title": "Are Sample-Efficient NLP Models More Robust?", + "abstract": "Recent results in image classification and extractive question answering have observed that pre-trained models trained on less in-distribution data have better out-of-distribution performance. However, it is unclear how broadly these trends hold. We conduct a large empirical study across three tasks, three broadly applicable modeling interventions (increasing model size, using a different adaptation method, and pre-training on more data), and 14 diverse datasets to investigate the relationship between sample efficiency (amount of data needed to reach a given ID accuracy) and robustness (how models fare on OOD evaluation). We find that higher sample efficiency is only correlated with better average OOD robustness on some modeling interventions and tasks, but not others. On individual datasets, models with lower sample efficiency can even be more robust. 
These results suggest that general-purpose methods for improving sample efficiency are unlikely to yield universal OOD robustness improvements, since such improvements are highly dataset- and task-dependent. Even in an era of large, multi-purpose pre-trained models, task-specific decisions may often be necessary for OOD generalization.", + "authors": [ + "Nelson F. Liu", + "Ananya Kumar", + "Percy Liang", + "Robin Jia" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.144", + "point2d": [ + 3.534088134765625, + -8.759675025939941 + ], + "cluster": 44.0 + }, + { + "idx": 1056, + "title": "Diversity-Aware Coherence Loss for Improving Neural Topic Models", + "abstract": "The standard approach for neural topic modeling uses a variational autoencoder (VAE) framework that jointly minimizes the KL divergence between the estimated posterior and prior, in addition to the reconstruction loss. Since neural topic models are trained by recreating individual input documents, they do not explicitly capture the coherence between words on the corpus level. In this work, we propose a novel diversity-aware coherence loss that encourages the model to learn corpus-level coherence scores while maintaining high diversity between topics. Experimental results on multiple datasets show that our method significantly improves the performance of neural topic models without requiring any pretraining or additional parameters.", + "authors": [ + "Raymond Li", + "Felipe Gonzalez-Pizarro", + "Linzi Xing", + "Gabriel Murray", + "Giuseppe Carenini" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.145", + "point2d": [ + -31.96021842956543, + 0.821961522102356 + ], + "cluster": 39.0 + }, + { + "idx": 1057, + "title": "NarrowBERT: Accelerating Masked Language Model Pretraining and Inference", + "abstract": "Large-scale language model pretraining is a very successful form of self-supervised learning in natural language processing, but it is increasingly expensive to perform as the models and pretraining corpora have become larger over time. We propose NarrowBERT, a modified transformer encoder that increases the throughput for masked language model pretraining by more than 2x. NarrowBERT sparsifies the transformer model such that the self-attention queries and feedforward layers only operate on the masked tokens of each sentence during pretraining, rather than all of the tokens as with the usual transformer encoder. We also show that NarrowBERT increases the throughput at inference time by as much as 3.5x with minimal (or no) performance degradation on sentence encoding tasks like MNLI. Finally, we examine the performance of NarrowBERT on the IMDB and Amazon reviews classification and CoNLL NER tasks and show that it is also comparable to standard BERT performance.", + "authors": [ + "Haoxin Li", + "Phillip Keung", + "Daniel Cheng", + "Jungo Kasai", + "Noah A. Smith" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.146", + "point2d": [ + -30.133163452148438, + -26.061058044433594 + ], + "cluster": 20.0 + }, + { + "idx": 1058, + "title": "S3HQA: A Three-Stage Approach for Multi-hop Text-Table Hybrid Question Answering", + "abstract": "Answering multi-hop questions over hybrid factual knowledge from the given text and table (TextTableQA) is a challenging task. 
Existing models mainly adopt a retriever-reader framework, which has several deficiencies, such as noisy labeling when training the retriever, insufficient utilization of heterogeneous information over text and table, and deficient ability for different reasoning operations. In this paper, we propose a three-stage TextTableQA framework S3HQA, which comprises a retriever, a selector, and a reasoner. We use a retriever with refinement training to solve the noisy labeling problem. Then, a hybrid selector considers the linked relationships between heterogeneous data to select the most relevant factual knowledge. For the final stage, instead of adapting a reading comprehension module as in previous methods, we employ a generation-based reasoner to obtain answers. This includes two approaches: a row-wise generator and an LLM prompting generator (used for the first time in this task). The experimental results demonstrate that our method achieves competitive results in the few-shot setting. When trained on the full dataset, our approach outperforms all baseline methods, ranking first on the HybridQA leaderboard.", + "authors": [ + "Fangyu Lei", + "Xiang Li", + "Yifan Wei", + "Shizhu He", + "Yiming Huang", + "Jun Zhao", + "Kang Liu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.147", + "point2d": [ + 73.4581527709961, + 5.787700176239014 + ], + "cluster": 5.0 + }, + { + "idx": 1059, + "title": "Towards Fewer Hallucinations in Knowledge-Grounded Dialogue Generation via Augmentative and Contrastive Knowledge-Dialogue", + "abstract": "Existing knowledge-grounded open-domain dialogue generation models often face the hallucination problem, i.e., the dialogue generation model will persist in using inappropriate knowledge and generate responses that are inconsistent with the facts. We argue that this problem mainly stems from the polarized optimization objectives and weak knowledge generation ability. To mitigate the hallucination, we take inspiration from human communication, where people reply with euphemistic responses to unclear or unrecognizable knowledge, and propose an Augmentative and Contrastive Knowledge Dialogue Expansion Framework (ACK-DEF). ACK-DEF constructs augmentative and contrastive knowledge dialogue samples, which consist of knowledge with different degrees of error and manually designed responses, to expand the original training set and smooth the polarized optimization objective, enabling models to generate the ground truth with or without gold knowledge. Beyond the knowledge, ACK-DEF also provides manually designed tactful responses corresponding to the incompletely correct knowledge. Experimental results on the Wizard of Wikipedia dataset show that employing ACK-DEF is effective in alleviating the hallucination problem.", + "authors": [ + "Bin Sun", + "Yitong Li", + "Fei Mi", + "Fanhu Bie", + "Yiwei Li", + "Kan Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.148", + "point2d": [ + 14.090208053588867, + 57.091949462890625 + ], + "cluster": 24.0 + }, + { + "idx": 1060, + "title": "AutoConv: Automatically Generating Information-seeking Conversations with Large Language Models", + "abstract": "Information-seeking conversation, which aims to help users gather information through conversation, has achieved great progress in recent years. However, the research is still stymied by the scarcity of training data. 
To alleviate this problem, we propose AutoConv for synthetic conversation generation, which takes advantage of the few-shot learning ability and generation capacity of large language models (LLMs). Specifically, we formulate the conversation generation problem as a language modeling task, then finetune an LLM with a few human conversations to capture the characteristics of the information-seeking process and use it for generating synthetic conversations with high quality. Experimental results on two frequently used datasets verify that AutoConv achieves substantial improvements over strong baselines and alleviates the dependence on human annotation. In addition, we also provide several analyses to promote future research.", + "authors": [ + "Siheng Li", + "Cheng Yang", + "Yichun Yin", + "Xinyu Zhu", + "Zesen Cheng", + "Lifeng Shang", + "Xin Jiang", + "Qun Liu", + "Yujiu Yang" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.149", + "point2d": [ + 12.935317993164062, + 62.138526916503906 + ], + "cluster": 49.0 + }, + { + "idx": 1061, + "title": "STT4SG-350: A Speech Corpus for All Swiss German Dialect Regions", + "abstract": "We present STT4SG-350, a corpus of Swiss German speech, annotated with Standard German text at the sentence level. The data is collected using a web app in which the speakers are shown Standard German sentences, which they translate to Swiss German and record. We make the corpus publicly available. It contains 343 hours of speech from all dialect regions and is the largest public speech corpus for Swiss German to date. Application areas include automatic speech recognition (ASR), text-to-speech, dialect identification, and speaker recognition. Dialect information, age group, and gender of the 316 speakers are provided. Genders are equally represented and the corpus includes speakers of all ages. Roughly the same amount of speech is provided per dialect region, which makes the corpus ideally suited for experiments with speech technology for different dialects. We provide training, validation, and test splits of the data. The test set consists of the same spoken sentences for each dialect region and allows a fair evaluation of the quality of speech technologies in different dialects. We train an ASR model on the training set and achieve an average BLEU score of 74.7 on the test set. The model beats the best published BLEU scores on 2 other Swiss German ASR test sets, demonstrating the quality of the corpus.", + "authors": [ + "Michel Pl\u00fcss", + "Jan Deriu", + "Yanick Schraner", + "Claudio Paonessa", + "Julia Hartmann", + "Larissa Schmidt", + "Christian Scheller", + "Manuela H\u00fcrlimann", + "Tanja Samard\u017ei\u0107", + "Manfred Vogel", + "Mark Cieliebak" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.150", + "point2d": [ + -72.65547943115234, + 18.42672348022461 + ], + "cluster": 37.0 + }, + { + "idx": 1062, + "title": "Teaching Small Language Models to Reason", + "abstract": "Chain of thought prompting successfully improves the reasoning capabilities of large language models, achieving state-of-the-art results on a range of datasets. However, these reasoning capabilities only appear to emerge in models with at least tens of billions of parameters. In this paper, we explore the transfer of such reasoning capabilities to smaller models via knowledge distillation, also investigating the trade-off between model and dataset size. 
Specifically, we finetune a student model on the chain of thought outputs generated by a larger teacher model. Our experiments show that the proposed method improves task performance across arithmetic, commonsense and symbolic reasoning datasets. For example, the accuracy of T5 XXL on GSM8K improves from 8.11% to 21.99% and 18.42% when finetuned on PaLM 540B and GPT-3 175B generated chains of thought, respectively.", + "authors": [ + "Lucie Charlotte Magister", + "Jonathan Mallinson", + "Jakub Adamek", + "Eric Malmi", + "Aliaksei Severyn" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.151", + "point2d": [ + 43.435848236083984, + -16.626977920532227 + ], + "cluster": 12.0 + }, + { + "idx": 1063, + "title": "A Simple and Effective Framework for Strict Zero-Shot Hierarchical Classification", + "abstract": "In recent years, large language models (LLMs) have achieved strong performance on benchmark tasks, especially in zero or few-shot settings. However, these benchmarks often do not adequately address the challenges posed in the real world, such as that of hierarchical classification. In order to address this challenge, we propose refactoring conventional tasks on hierarchical datasets into a more indicative long-tail prediction task. We observe LLMs are more prone to failure in these cases. To address these limitations, we propose the use of entailment-contradiction prediction in conjunction with LLMs, which allows for strong performance in a strict zero-shot setting. Importantly, our method does not require any parameter updates, a resource-intensive process, and achieves strong performance across multiple datasets.", + "authors": [ + "Rohan Bhambhoria", + "Lei Chen", + "Xiaodan Zhu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.152", + "point2d": [ + 3.7864930629730225, + -24.417966842651367 + ], + "cluster": 17.0 + }, + { + "idx": 1064, + "title": "A Simple Concatenation can Effectively Improve Speech Translation", + "abstract": "A speech translation triple comprises speech, transcription, and translation. In the end-to-end paradigm, text machine translation (MT) usually plays the role of a teacher model for the speech translation (ST) via knowledge distillation. Parameter sharing with the teacher is often adopted to construct the ST model architecture; however, the two modalities are independently fed and trained via different losses. This situation does not match ST\u2019s properties across two modalities and also limits the upper bound of the performance. Inspired by the works of video Transformer, we propose a simple unified cross-modal ST method, which concatenates speech and text as the input, and builds a teacher that can utilize information from both modalities simultaneously. 
Experimental results show that in our unified ST framework, models can effectively utilize the auxiliary information from speech and text, and achieve compelling results on the MuST-C datasets.", + "authors": [ + "Linlin Zhang", + "Kai Fan", + "Boxing Chen", + "Luo Si" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.153", + "point2d": [ + -67.8077392578125, + 21.737590789794922 + ], + "cluster": 37.0 + }, + { + "idx": 1065, + "title": "ScoNe: Benchmarking Negation Reasoning in Language Models With Fine-Tuning and In-Context Learning", + "abstract": "A number of recent benchmarks seek to assess how well models handle natural language negation. However, these benchmarks lack the controlled example paradigms that would allow us to infer whether a model had truly learned how negation morphemes semantically scope. To fill these analytical gaps, we present the Scoped Negation NLI (ScoNe-NLI) benchmark, which contains contrast sets of six examples with up to two negations where either zero, one, or both negative morphemes affect the NLI label. We use ScoNe-NLI to assess fine-tuning and in-context learning strategies. We find that RoBERTa and DeBERTa models solve ScoNe-NLI after many-shot fine-tuning. For in-context learning, we test the latest InstructGPT models and find that most prompt strategies are not successful, including those using step-by-step reasoning. To better understand this result, we extend ScoNe with ScoNe-NLG, a sentence completion test set that embeds negation reasoning in short narratives. Here, InstructGPT is successful, which reveals the model can correctly reason about negation, but struggles to do so on NLI examples outside of its core pretraining regime.", + "authors": [ + "Jingyuan S. She", + "Christopher Potts", + "Samuel R. Bowman", + "Atticus Geiger" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.154", + "point2d": [ + 42.098236083984375, + -12.062662124633789 + ], + "cluster": 36.0 + }, + { + "idx": 1066, + "title": "Revisiting Automated Prompting: Are We Actually Doing Better?", + "abstract": "Current literature demonstrates that Large Language Models (LLMs) are great few-shot learners, and prompting significantly increases their performance on a range of downstream tasks in a few-shot learning setting. An attempt to automate human-led prompting followed, with some progress achieved. In particular, subsequent work demonstrates that automation can outperform fine-tuning in certain K-shot learning scenarios. In this paper, we revisit techniques for automated prompting on six different downstream tasks and a larger range of K-shot learning settings. We find that automated prompting does not consistently outperform simple manual prompting. Our work suggests that, in addition to fine-tuning, manual prompting should be used as a baseline in this line of research.", + "authors": [ + "Yulin Zhou", + "Yiren Zhao", + "Ilia Shumailov", + "Robert Mullins", + "Yarin Gal" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.155", + "point2d": [ + -16.250173568725586, + -10.080114364624023 + ], + "cluster": 3.0 + }, + { + "idx": 1067, + "title": "Mind the Gap between the Application Track and the Real World", + "abstract": "Recent advances in NLP have led to a rise in inter-disciplinary and application-oriented research. 
While this demonstrates the growing real-world impact of the field, research papers frequently feature experiments that do not account for the complexities of realistic data and environments. To explore the extent of this gap, we investigate the relationship between the real-world motivations described in NLP papers and the models and evaluation which comprise the proposed solution. We first survey papers from the NLP Applications track from ACL 2020 and EMNLP 2020, asking which papers have differences between their stated motivation and their experimental setting, and, if so, whether they mention them. We find that many papers fall short of considering real-world input and output conditions due to adopting simplified modeling or evaluation settings. As a case study, we then empirically show that the performance of an educational dialog understanding system deteriorates when used in a realistic classroom environment.", + "authors": [ + "Ananya Ganesh", + "Jie Cao", + "E. Margaret Perkoff", + "Rosy Southwell", + "Martha Palmer", + "Katharina Kann" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.156", + "point2d": [ + 15.36914348602295, + 47.407840728759766 + ], + "cluster": 2.0 + }, + { + "idx": 1068, + "title": "How to Distill your BERT: An Empirical Study on the Impact of Weight Initialisation and Distillation Objectives", + "abstract": "Recently, various intermediate layer distillation (ILD) objectives have been shown to improve compression of BERT models via Knowledge Distillation (KD). However, a comprehensive evaluation of the objectives in both task-specific and task-agnostic settings is lacking. To the best of our knowledge, this is the first work comprehensively evaluating distillation objectives in both settings. We show that attention transfer gives the best performance overall. We also study the impact of layer choice when initializing the student from the teacher layers, finding a significant impact on the performance in task-specific distillation. For vanilla KD and hidden states transfer, initialisation with lower layers of the teacher gives a considerable improvement over higher layers, especially on the task of QNLI (up to an absolute percentage change of 17.8 in accuracy). Attention transfer behaves consistently under different initialisation settings. We release our code as an efficient transformer-based model distillation framework for further studies.", + "authors": [ + "Xinpeng Wang", + "Leonie Weissweiler", + "Hinrich Sch\u00fctze", + "Barbara Plank" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.157", + "point2d": [ + -47.17864990234375, + -23.073638916015625 + ], + "cluster": 39.0 + }, + { + "idx": 1069, + "title": "ACTC: Active Threshold Calibration for Cold-Start Knowledge Graph Completion", + "abstract": "Self-supervised knowledge-graph completion (KGC) relies on estimating a scoring model over (entity, relation, entity)-tuples, for example, by embedding an initial knowledge graph. Prediction quality can be improved by calibrating the scoring model, typically by adjusting the prediction thresholds using manually annotated examples. In this paper, we attempt cold-start calibration for KGC for the first time, where no annotated examples exist initially for calibration, and only a limited number of tuples can be selected for annotation. Our new method ACTC finds good per-relation thresholds efficiently based on a limited set of annotated tuples. 
In addition to the few annotated tuples, ACTC also leverages unlabeled tuples by estimating their correctness with Logistic Regression or Gaussian Process classifiers. We also experiment with different methods for selecting candidate tuples for annotation: density-based and random selection. Experiments with five scoring models and an oracle annotator show an improvement of 7 percentage points when using ACTC in the challenging setting with an annotation budget of only 10 tuples, and an average improvement of 4 percentage points over different budgets.", + "authors": [ + "Anastasiia Sedova", + "Benjamin Roth" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.158", + "point2d": [ + 57.651153564453125, + -65.85623931884766 + ], + "cluster": 45.0 + }, + { + "idx": 1070, + "title": "Task-Aware Specialization for Efficient and Robust Dense Retrieval for Open-Domain Question Answering", + "abstract": "Given their effectiveness on knowledge-intensive natural language processing tasks, dense retrieval models have become increasingly popular. Specifically, the de-facto architecture for open-domain question answering uses two isomorphic encoders that are initialized from the same pretrained model but separately parameterized for questions and passages. This biencoder architecture is parameter-inefficient in that there is no parameter sharing between encoders. Further, recent studies show that such dense retrievers underperform BM25 in various settings. We thus propose a new architecture, Task-Aware Specialization for dEnse Retrieval (TASER), which enables parameter sharing by interleaving shared and specialized blocks in a single encoder. Our experiments on five question answering datasets show that TASER can achieve superior accuracy, surpassing BM25, while using about 60% of the parameters of bi-encoder dense retrievers. In out-of-domain evaluations, TASER is also empirically more robust than bi-encoder dense retrievers. 
Our code is available at https://github.com/microsoft/taser.", + "authors": [ + "Hao Cheng", + "Hao Fang", + "Xiaodong Liu", + "Jianfeng Gao" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.159", + "point2d": [ + 8.084159851074219, + -10.782065391540527 + ], + "cluster": 5.0 + }, + { + "idx": 1071, + "title": "Linear Classifier: An Often-Forgotten Baseline for Text Classification", + "abstract": "Large-scale pre-trained language models such as BERT are popular solutions for text classification. Due to the superior performance of these advanced methods, nowadays, people often directly train them for a few epochs and deploy the obtained model. In this opinion paper, we point out that this practice may not always yield satisfactory results. We argue for the importance of running a simple baseline like linear classifiers on bag-of-words features along with advanced methods. First, for many text datasets, linear methods show competitive performance, high efficiency, and robustness. Second, advanced models such as BERT may only achieve the best results if properly applied. Simple baselines help to confirm whether the results of advanced models are acceptable. Our experimental results fully support these points.", + "authors": [ + "Yu-Chen Lin", + "Si-An Chen", + "Jie-Jyun Liu", + "Chih-Jen Lin" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.160", + "point2d": [ + -2.1908435821533203, + -19.79381561279297 + ], + "cluster": 17.0 + }, + { + "idx": 1072, + "title": "Randomized Positional Encodings Boost Length Generalization of Transformers", + "abstract": "Transformers have impressive generalization capabilities on tasks with a fixed context length. However, they fail to generalize to sequences of arbitrary length, even for seemingly simple tasks such as duplicating a string. Moreover, simply training on longer sequences is inefficient due to the quadratic computation complexity of the global attention mechanism. In this work, we demonstrate that this failure mode is linked to positional encodings being out-of-distribution for longer sequences (even for relative encodings) and introduce a novel family of positional encodings that can overcome this problem. Concretely, our randomized positional encoding scheme simulates the positions of longer sequences and randomly selects an ordered subset to fit the sequence\u2019s length. Our large-scale empirical evaluation of 6000 models across 15 algorithmic reasoning tasks shows that our method allows Transformers to generalize to sequences of unseen length (increasing test accuracy by 12.0% on average).", + "authors": [ + "Anian Ruoss", + "Gr\u00e9goire Del\u00e9tang", + "Tim Genewein", + "Jordi Grau-Moya", + "R\u00f3bert Csord\u00e1s", + "Mehdi Bennani", + "Shane Legg", + "Joel Veness" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.161", + "point2d": [ + -24.25556755065918, + -47.779109954833984 + ], + "cluster": 27.0 + }, + { + "idx": 1073, + "title": "Table and Image Generation for Investigating Knowledge of Entities in Pre-trained Vision and Language Models", + "abstract": "In this paper, we propose a table and image generation task to verify how the knowledge about entities acquired from natural language is retained in Vision & Language (V & L) models. 
This task consists of two parts: the first is to generate a table containing knowledge about an entity and its related image, and the second is to generate an image from an entity with a caption and a table containing related knowledge of the entity. In both tasks, the model must know the entities used to perform the generation properly. We created the Wikipedia Table and Image Generation (WikiTIG) dataset from about 200,000 infoboxes in English Wikipedia articles to perform the proposed tasks. We evaluated the performance on the tasks with respect to the above research question using the V & L model OFA, which has achieved state-of-the-art results in multiple tasks. Experimental results show that OFA forgets part of its entity knowledge during pre-training as a complement to improve the performance of image-related tasks.", + "authors": [ + "Hidetaka Kamigaito", + "Katsuhiko Hayashi", + "Taro Watanabe" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.162", + "point2d": [ + -54.56088638305664, + 44.98957824707031 + ], + "cluster": 43.0 + }, + { + "idx": 1074, + "title": "Improving Grammar-based Sequence-to-Sequence Modeling with Decomposition and Constraints", + "abstract": "Neural QCFG is a grammar-based sequence-to-sequence model with strong inductive biases on hierarchical structures. It excels in interpretability and generalization but suffers from expensive inference. In this paper, we study two low-rank variants of Neural QCFG for faster inference with different trade-offs between efficiency and expressiveness. Furthermore, utilizing the symbolic interface provided by the grammar, we introduce two soft constraints over tree hierarchy and source coverage. We experiment with various datasets and find that our models outperform vanilla Neural QCFG in most settings.", + "authors": [ + "Chao Lou", + "Kewei Tu" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.163", + "point2d": [ + -23.974609375, + -57.1866455078125 + ], + "cluster": 41.0 + }, + { + "idx": 1075, + "title": "TeCS: A Dataset and Benchmark for Tense Consistency of Machine Translation", + "abstract": "Tense inconsistency frequently occurs in machine translation. However, there are few criteria to assess the model\u2019s mastery of tense prediction from a linguistic perspective. In this paper, we present a parallel tense test set, containing 552 French-English utterances. We also introduce a corresponding benchmark, tense prediction accuracy. With the tense test set and the benchmark, researchers are able to measure the tense consistency performance of machine translation systems for the first time.", + "authors": [ + "Yiming Ai", + "Zhiwei He", + "Kai Yu", + "Rui Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "short", + "doi": "10.18653/v1/2023.acl-short.164", + "point2d": [ + -69.17807006835938, + 1.3227413892745972 + ], + "cluster": 1.0 + }, + { + "idx": 1076, + "title": "Human-in-the-loop Schema Induction", + "abstract": "Schema induction builds a graph representation explaining how events unfold in a scenario. Existing approaches have been based on information retrieval (IR) and information extraction (IE), often with limited human curation. We demonstrate a human-in-the-loop schema induction system powered by GPT-3. 
We first describe the different modules of our system, including prompting to generate schematic elements, manual editing of those elements, and conversion of those elements into a schema graph. By qualitatively comparing our system to previous ones, we show that our system not only transfers to new domains more easily than previous approaches, but also reduces the effort of human curation thanks to our interactive interface.", + "authors": [ + "Tianyi Zhang", + "Isaac Tham", + "Zhaoyi Hou", + "Jiaxuan Ren", + "Leon Zhou", + "Hainiu Xu", + "Li Zhang", + "Lara Martin", + "Rotem Dror", + "Sha Li", + "Heng Ji", + "Martha Palmer", + "Susan Windisch Brown", + "Reece Suchocki", + "Chris Callison-Burch" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.1", + "point2d": [ + 56.421119689941406, + -44.1243782043457 + ], + "cluster": 0.0 + }, + { + "idx": 1077, + "title": "PersLEARN: Research Training through the Lens of Perspective Cultivation", + "abstract": "Scientific research is inherently shaped by its authors\u2019 perspectives, influenced by various factors such as their personality, community, or society. Junior researchers often face challenges in identifying the perspectives reflected in the existing literature and struggle to develop their own viewpoints. In response to this issue, we introduce PersLEARN, a tool designed to facilitate the cultivation of scientific perspectives, starting from a basic seed idea and progressing to a well-articulated framework. By interacting with a prompt-based model, researchers can develop their perspectives explicitly. Our human study reveals that scientific perspectives developed by students using PersLEARN exhibit a superior level of logical coherence and depth compared to those that did not. Furthermore, our pipeline outperforms baseline approaches across multiple domains of literature from various perspectives. These results suggest that PersLEARN could help foster a greater appreciation of diversity in scientific perspectives as an essential component of research training.", + "authors": [ + "Yu-Zhe Shi", + "Shiqian Li", + "Xinyi Niu", + "Qiao Xu", + "Jiawen Liu", + "Yifan Xu", + "Shiyu Gu", + "Bingru He", + "Xinyang Li", + "Xinyu Zhao", + "Zijian Zhao", + "Yidong Lyu", + "Zhen Li", + "Sijia Liu", + "Lin Qiu", + "Jinhao Ji", + "Lecheng Ruan", + "Yuxi Ma", + "Wenjuan Han", + "Yixin Zhu" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.2", + "point2d": [ + 14.357820510864258, + 17.908077239990234 + ], + "cluster": 40.0 + }, + { + "idx": 1078, + "title": "LAVIS: A One-stop Library for Language-Vision Intelligence", + "abstract": "We introduce LAVIS, an open-source deep learning library for LAnguage-VISion research and applications. LAVIS aims to serve as a one-stop comprehensive library that makes recent advancements in the language-vision field accessible for researchers and practitioners, as well as fertilizing future research and development. It features a unified interface to easily access state-of-the-art image-language, video-language models and common datasets. LAVIS supports training, evaluation and benchmarking on a rich variety of tasks, including multimodal classification, retrieval, captioning, visual question answering, dialogue and pre-training. In the meantime, the library is also highly extensible and configurable, facilitating future development and customization. 
In this technical report, we describe design principles, key components and functionalities of the library, and also present benchmarking results across common language-vision tasks.", + "authors": [ + "Dongxu Li", + "Junnan Li", + "Hung Le", + "Guangsen Wang", + "Silvio Savarese", + "Steven C.H. Hoi" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.3", + "point2d": [ + -54.676761627197266, + 35.84003448486328 + ], + "cluster": 26.0 + }, + { + "idx": 1079, + "title": "Finspector: A Human-Centered Visual Inspection Tool for Exploring and Comparing Biases among Foundation Models", + "abstract": "Pre-trained transformer-based language models are becoming increasingly popular due to their exceptional performance on various benchmarks. However, concerns persist regarding the presence of hidden biases within these models, which can lead to discriminatory outcomes and reinforce harmful stereotypes. To address this issue, we propose Finspector, a human-centered visual inspection tool designed to detect biases in different categories through log-likelihood scores generated by language models. The goal of the tool is to enable researchers to easily identify potential biases using visual analytics, ultimately contributing to a fairer and more just deployment of these models in both academic and industrial settings. Finspector is available at https://github.com/IBM/finspector.", + "authors": [ + "Bum Chul Kwon", + "Nandana Mihindukulasooriya" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.4", + "point2d": [ + 18.248336791992188, + 29.543052673339844 + ], + "cluster": 10.0 + }, + { + "idx": 1080, + "title": "PrimeQA: The Prime Repository for State-of-the-Art Multilingual Question Answering Research and Development", + "abstract": "The field of Question Answering (QA) has made remarkable progress in recent years, thanks to the advent of large pre-trained language models, newer realistic benchmark datasets with leaderboards, and novel algorithms for key components such as retrievers and readers. In this paper, we introduce PrimeQA: a one-stop and open-source QA repository with an aim to democratize QA research and facilitate easy replication of state-of-the-art (SOTA) QA methods. PrimeQA supports core QA functionalities like retrieval and reading comprehension as well as auxiliary capabilities such as question generation. It has been designed as an end-to-end toolkit for various use cases: building front-end applications, replicating SOTA methods on public benchmarks, and expanding pre-existing methods. PrimeQA is available at: https://github.com/primeqa.", + "authors": [ + "Avi Sil", + "Jaydeep Sen", + "Bhavani Iyer", + "Martin Franz", + "Kshitij Fadnis", + "Mihaela Bornea", + "Sara Rosenthal", + "Scott McCarley", + "Rong Zhang", + "Vishwajeet Kumar", + "Yulong Li", + "Md Arafat Sultan", + "Riyaz Bhat", + "Juergen Bross", + "Radu Florian", + "Salim Roukos" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.5", + "point2d": [ + 66.57666778564453, + 14.98121452331543 + ], + "cluster": 5.0 + }, + { + "idx": 1081, + "title": "Lingxi: A Diversity-aware Chinese Modern Poetry Generation System", + "abstract": "Chinese modern poetry generation has been a challenging task. One issue is Chinese word segmentation (CWS), which is critical for comprehending the Chinese language but is not always considered in common tokenization methods. 
Another is the decoding (sampling) method, which may induce repetition and boredom and severely lower the diversity of the generated poetry. To address these issues, we present Lingxi, a diversity-aware Chinese modern poetry generation system. For the CWS issue, we propose a novel framework that incorporates CWS in the tokenization process. The proposed method can achieve a high vocabulary coverage rate with a reasonable vocabulary size. For the decoding method and the diversity issue, we propose a novel sampling algorithm that flattens the high-likelihood part of the predicted distribution of the language model to emphasize the comparatively low-likelihood words and increase the diversity of generated poetry. Empirical results show that even when the top 60% of cumulative probability mass of the predicted distribution is flattened, our method achieves performance comparable to or even better than that of baseline sampling methods. Our system is available at http://lingxi.website.", + "authors": [ + "Xinran Zhang", + "Maosong Sun", + "Jiafeng Liu", + "Xiaobing Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.6", + "point2d": [ + -29.54268455505371, + 39.55897903442383 + ], + "cluster": 35.0 + }, + { + "idx": 1082, + "title": "Autodive: An Integrated Onsite Scientific Literature Annotation Tool", + "abstract": "Scientific literature is always available in Adobe\u2019s Portable Document Format (PDF), which is friendly for scientists to read. Compared with raw text, annotating directly on PDF documents can greatly improve the labeling efficiency of scientists whose annotation costs are very high. In this paper, we present Autodive, an integrated onsite scientific literature annotation tool for natural scientists and Natural Language Processing (NLP) researchers. This tool provides six core functions of annotation that support the whole lifecycle of corpus generation, including i) annotation project management, ii) resource management, iii) ontology management, iv) manual annotation, v) onsite auto annotation, and vi) annotation task statistics. Two experiments are carried out to verify the efficiency of the presented tool. A live demo of Autodive is available at http://autodive.sciwiki.cn. The source code is available at https://github.com/Autodive.", + "authors": [ + "Yi Du", + "Ludi Wang", + "Mengyi Huang", + "Dongze Song", + "Wenjuan Cui", + "Yuanchun Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.7", + "point2d": [ + 16.676834106445312, + 14.920530319213867 + ], + "cluster": 40.0 + }, + { + "idx": 1083, + "title": "A Practical Toolkit for Multilingual Question and Answer Generation", + "abstract": "Generating questions along with associated answers from a text has applications in several domains, such as creating reading comprehension tests for students, or improving document search by providing auxiliary questions and answers based on the query. Training models for question and answer generation (QAG) is not straightforward due to the expected structured output (i.e. a list of question and answer pairs), as it requires more than generating a single sentence. This results in a small number of publicly accessible QAG models. In this paper, we introduce AutoQG, an online service for multilingual QAG along with lmqg, an all-in-one Python package for model fine-tuning, generation, and evaluation. 
We also release QAG models in eight languages fine-tuned on a few variants of pre-trained encoder-decoder language models, which can be used online via AutoQG or locally via lmqg. With these resources, practitioners of any level can benefit from a toolkit that includes a web interface for end users, and easy-to-use code for developers who require custom models or fine-grained controls for generation.", + "authors": [ + "Asahi Ushio", + "Fernando Alva-Manchego", + "Jose Camacho-Collados" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.8", + "point2d": [ + 67.75311279296875, + 15.157502174377441 + ], + "cluster": 5.0 + }, + { + "idx": 1084, + "title": "OpenSLU: A Unified, Modularized, and Extensible Toolkit for Spoken Language Understanding", + "abstract": "Spoken Language Understanding (SLU) is one of the core components of a task-oriented dialogue system, which aims to extract the semantic meaning of user queries (e.g., intents and slots). In this work, we introduce OpenSLU, an open-source toolkit to provide a unified, modularized, and extensible toolkit for spoken language understanding. Specifically, OpenSLU unifies 10 SLU models for both single-intent and multi-intent scenarios, which support both non-pretrained and pretrained models simultaneously. Additionally, OpenSLU is highly modularized and extensible by decomposing the model architecture, inference, and learning process into reusable modules, which allows researchers to quickly set up SLU experiments with highly flexible configurations. OpenSLU is implemented based on PyTorch, and released at https://github.com/LightChen233/OpenSLU.", + "authors": [ + "Libo Qin", + "Qiguang Chen", + "Xiao Xu", + "Yunlong Feng", + "Wanxiang Che" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.9", + "point2d": [ + -2.9427199363708496, + 64.07303619384766 + ], + "cluster": 24.0 + }, + { + "idx": 1085, + "title": "SanskritShala: A Neural Sanskrit NLP Toolkit with Web-Based Interface for Pedagogical and Annotation Purposes", + "abstract": "We present a neural Sanskrit Natural Language Processing (NLP) toolkit named SanskritShala (a school of Sanskrit) to facilitate computational linguistic analyses for several tasks such as word segmentation, morphological tagging, dependency parsing, and compound type identification. Our systems currently report state-of-the-art performance on available benchmark datasets for all tasks. SanskritShala is deployed as a web-based application, which allows a user to get real-time analysis for the given input. It is built with easy-to-use interactive data annotation features that allow annotators to correct the system predictions when it makes mistakes. We publicly release the source codes of the 4 modules included in the toolkit, 7 word embedding models that have been trained on publicly available Sanskrit corpora and multiple annotated datasets such as word similarity, relatedness, categorization, analogy prediction to assess intrinsic properties of word embeddings. So far as we know, this is the first neural-based Sanskrit NLP toolkit that has a web-based interface and a number of NLP modules. We are sure that the people who are willing to work with Sanskrit will find it useful for pedagogical and annotative purposes. SanskritShala is available at: https://cnerg.iitkgp.ac.in/sanskritshala. 
The demo video of our platform can be accessed at: https://youtu.be/x0X31Y9k0mw4.", + "authors": [ + "Jivnesh Sandhan", + "Anshul Agarwal", + "Laxmidhar Behera", + "Tushar Sandhan", + "Pawan Goyal" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.10", + "point2d": [ + -40.16293716430664, + -41.99506759643555 + ], + "cluster": 46.0 + }, + { + "idx": 1086, + "title": "LIDA: A Tool for Automatic Generation of Grammar-Agnostic Visualizations and Infographics using Large Language Models", + "abstract": "Systems that support users in the automatic creation of visualizations must address several subtasks - understand the semantics of data, enumerate relevant visualization goals and generate visualization specifications. In this work, we pose visualization generation as a multi-stage generation problem and argue that well-orchestrated pipelines based on large language models (LLMs) and image generation models (IGMs) are suitable for addressing these tasks. We present LIDA, a novel tool for generating grammar-agnostic visualizations and infographics. LIDA comprises 4 modules - A SUMMARIZER that converts data into a rich but compact natural language summary, a GOAL EXPLORER that enumerates visualization goals given the data, a VISGENERATOR that generates, refines, executes and filters visualization code and an INFOGRAPHER module that yields data-faithful stylized graphics using IGMs. LIDA provides a python api, and a hybrid user interface (direct manipulation and multilingual natural language) for interactive chart, infographics and data story generation. Code and demo are available at this url - https://microsoft.github.io/lida/", + "authors": [ + "Victor Dibia" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.11", + "point2d": [ + -49.41615295410156, + 50.85731506347656 + ], + "cluster": 0.0 + }, + { + "idx": 1087, + "title": "MetaPro Online: A Computational Metaphor Processing Online System", + "abstract": "Metaphoric expressions are a special linguistic phenomenon, frequently appearing in everyday language. Metaphors do not take their literal meanings in contexts, which may cause obstacles for language learners to understand them. Metaphoric expressions also reflect the cognition of humans via concept mappings, attracting great attention from cognitive science and psychology communities. Thus, we aim to develop a computational metaphor processing online system, termed MetaPro Online, that allows users without a coding background, e.g., language learners and linguists, to easily query metaphoricity labels, metaphor paraphrases, and concept mappings for non-domain-specific text. The outputs of MetaPro can be directly used by language learners and natural language processing downstream tasks because MetaPro is an end-to-end system.", + "authors": [ + "Rui Mao", + "Xiao Li", + "Kai He", + "Mengshi Ge", + "Erik Cambria" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.12", + "point2d": [ + 7.8091630935668945, + -56.99569320678711 + ], + "cluster": 9.0 + }, + { + "idx": 1088, + "title": "DIAGRAPH: An Open-Source Graphic Interface for Dialog Flow Design", + "abstract": "In this work, we present DIAGRAPH, an open-source graphical dialog flow editor built on the ADVISER toolkit.
Our goal for this tool is threefold: 1) To support subject-experts in intuitively creating complex and flexible dialog systems, 2) To support rapid prototyping of dialog system behavior, e.g., for research, and 3) To provide a hands-on test bed for students learning about dialog systems. To facilitate this, DIAGRAPH aims to provide a clean and intuitive graphical interface for creating dialog systems without requiring any coding knowledge. Once a dialog graph has been created, it is automatically turned into a dialog system using state-of-the-art language models. This allows for rapid prototyping and testing. Dialog designers can then distribute a link to their finished dialog system or embed it into a website. Additionally, to support scientific experiments and data collection, dialog designers can access chat logs. Finally, to verify the usability of DIAGRAPH, we performed an evaluation with subject-experts who worked extensively with the tool and with users testing it for the first time, receiving above-average System Usability Scale (SUS) scores from both (82 out of 100 and 75 out of 100, respectively). In this way, we hope DIAGRAPH helps reduce the barrier to entry for creating dialog interactions.", + "authors": [ + "Dirk V\u00e4th", + "Lindsey Vanderlyn", + "Ngoc Thang Vu" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.13", + "point2d": [ + 19.311784744262695, + 66.97322082519531 + ], + "cluster": 24.0 + }, + { + "idx": 1089, + "title": "disco: a toolkit for Distributional Control of Generative Models", + "abstract": "Pre-trained language models and other generative models have revolutionized NLP and beyond. However, these models tend to reproduce undesirable biases present in their training data. Also, they may overlook patterns that are important but challenging to capture. To address these limitations, researchers have introduced distributional control techniques. These techniques, not limited to language, allow controlling the prevalence (i.e. expectations) of any features of interest in the model\u2019s outputs. Despite their potential, the widespread adoption of these techniques has been hindered by the difficulty in adapting the complex, disconnected code. Here, we present disco, an open-source Python library that brings these techniques to the broader public.", + "authors": [ + "Germ\u00e1n Kruszewski", + "Jos Rozen", + "Marc Dymetman" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.14", + "point2d": [ + -12.292795181274414, + 12.534407615661621 + ], + "cluster": 4.0 + }, + { + "idx": 1090, + "title": "A Hyperparameter Optimization Toolkit for Neural Machine Translation Research", + "abstract": "Hyperparameter optimization is an important but often overlooked process in the research of deep learning technologies. To obtain a good model, one must carefully tune hyperparameters that determine the architecture and training algorithm. Insufficient tuning may result in poor results, while inequitable tuning may lead to exaggerated differences between models. We present a hyperparameter optimization toolkit for neural machine translation (NMT) to help researchers focus their time on the creative rather than the mundane. The toolkit is implemented as a wrapper on top of the open-source Sockeye NMT software.
Using the Asynchronous Successive Halving Algorithm (ASHA), we demonstrate that it is possible to discover near-optimal models under a computational budget with little effort. Code: https://github.com/kevinduh/sockeye-recipes3. Video demo: https://cs.jhu.edu/~kevinduh/j/demo.mp4", + "authors": [ + "Xuan Zhang", + "Kevin Duh", + "Paul McNamee" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.15", + "point2d": [ + -63.92575454711914, + -10.195845603942871 + ], + "cluster": 21.0 + }, + { + "idx": 1091, + "title": "Japanese-to-English Simultaneous Dubbing Prototype", + "abstract": "Live video streaming has become an important form of communication in settings such as virtual conferences. However, for cross-language communication in live video streaming, reading subtitles degrades the viewing experience. To address this problem, our simultaneous dubbing prototype translates and replaces the original speech of a live video stream in a simultaneous manner. Tests on a collection of 90 public videos show that our system achieves a low average latency of 11.90 seconds for smooth playback. Our method is general and can be extended to other language pairs.", + "authors": [ + "Xiaolin Wang", + "Masao Utiyama", + "Eiichiro Sumita" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.16", + "point2d": [ + -71.66463470458984, + 25.047420501708984 + ], + "cluster": 29.0 + }, + { + "idx": 1092, + "title": "VisKoP: Visual Knowledge oriented Programming for Interactive Knowledge Base Question Answering", + "abstract": "We present Visual Knowledge oriented Programming platform (VisKoP), a knowledge base question answering (KBQA) system that integrates humans into the loop to edit and debug the knowledge base (KB) queries. VisKoP not only provides a neural program induction module, which converts natural language questions into knowledge oriented program language (KoPL), but also maps KoPL programs into graphical elements. KoPL programs can be edited with simple graphical operators, such as \u201cdragging\u201d to add knowledge operators and \u201cslot filling\u201d to designate operator arguments. Moreover, VisKoP provides auto-completion for its knowledge base schema and users can easily debug the KoPL program by checking its intermediate results. To facilitate the practical KBQA on a million-entity-level KB, we design a highly efficient KoPL execution engine for the back-end. Experiment results show that VisKoP is highly efficient and user interaction can fix a large portion of wrong KoPL programs to acquire the correct answer. The VisKoP online demo, highly efficient KoPL engine, and screencast video are now publicly available.", + "authors": [ + "Zijun Yao", + "Yuanyong Chen", + "Xin Lv", + "Shulin Cao", + "Amy Xin", + "Jifan Yu", + "Hailong Jin", + "Jianjun Xu", + "Peng Zhang", + "Lei Hou", + "Juanzi Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.17", + "point2d": [ + 71.8163833618164, + -5.802855014801025 + ], + "cluster": 0.0 + }, + { + "idx": 1093, + "title": "PEEP-Talk: A Situational Dialogue-based Chatbot for English Education", + "abstract": "English is acknowledged worldwide as a mode of communication. However, due to the absence of realistic practicing scenarios, students learning English as a foreign language (EFL) typically have limited chances to converse and share feedback with others.
In this paper, we propose PEEP-Talk, a real-world situational dialogue-based chatbot designed for English education. It also naturally switches to a new topic or situation in response to out-of-topic utterances, which are common among English beginners. Furthermore, PEEP-Talk provides a feedback score on the conversation as well as grammar error correction. We performed automatic and user evaluations to validate the performance and educational efficiency of our system. The results show that PEEP-Talk generates appropriate responses in various real-life situations while providing accurate feedback to learners. Moreover, we demonstrate a positive impact on English-speaking, grammar, and English learning anxiety, implying that PEEP-Talk can lower the barrier to learning natural conversation in effective ways.", + "authors": [ + "Seugnjun Lee", + "Yoonna Jang", + "Chanjun Park", + "Jungseob Lee", + "Jaehyung Seo", + "Hyeonseok Moon", + "Sugyeong Eo", + "Seounghoon Lee", + "Bernardo Yahya", + "Heuiseok Lim" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.18", + "point2d": [ + 16.82770347595215, + 68.8749771118164 + ], + "cluster": 24.0 + }, + { + "idx": 1094, + "title": "OpenTIPE: An Open-source Translation Framework for Interactive Post-Editing Research", + "abstract": "Despite the latest improvements in machine translation, professional translators still must review and post-edit the automatic output to ensure high-quality translations. The research on automating this process lacks an interactive post-editing environment implemented for this purpose; therefore, current approaches do not consider the human interactions that occur in real post-editing scenarios. To address this issue, we present OpenTIPE, a flexible and extensible framework that aims at supporting research on interactive post-editing. Specifically, the interactive environment of OpenTIPE allows researchers to explore human-centered approaches for the post-editing task. We release the OpenTIPE source code and showcase its main functionalities with a demonstration video and an online live demo.", + "authors": [ + "Fabian Landwehr", + "Thomas Steinmann", + "Laura Mascarell" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.19", + "point2d": [ + -64.28861236572266, + 1.4170397520065308 + ], + "cluster": 1.0 + }, + { + "idx": 1095, + "title": "TencentPretrain: A Scalable and Flexible Toolkit for Pre-training Models of Different Modalities", + "abstract": "Recently, the success of pre-training in the text domain has been fully extended to vision, audio, and cross-modal scenarios. The proposed pre-training models of different modalities are showing a rising trend of homogeneity in their model structures, which brings the opportunity to implement different pre-training models within a uniform framework. In this paper, we present TencentPretrain, a toolkit supporting pre-training models of different modalities. The core feature of TencentPretrain is the modular design. The toolkit uniformly divides pre-training models into 5 components: embedding, encoder, target embedding, decoder, and target. As almost all common modules are provided in each component, users can choose the desired modules from different components to build a complete pre-training model. The modular design enables users to efficiently reproduce existing pre-training models or build a brand-new one.
We test the toolkit on text, vision, and audio benchmarks and show that it can match the performance of the original implementations.", + "authors": [ + "Zhe Zhao", + "Yudong Li", + "Cheng Hou", + "Jing Zhao", + "Rong Tian", + "Weijie Liu", + "Yiren Chen", + "Ningyuan Sun", + "Haoyan Liu", + "Weiquan Mao", + "Han Guo", + "Weigang Gou", + "Taiqiang Wu", + "Tao Zhu", + "Wenhang Shi", + "Chen Chen", + "Shan Huang", + "Sihong Chen", + "Liqun Liu", + "Feifei Li", + "Xiaoshuai Chen", + "Xingwu Sun", + "Zhanhui Kang", + "Xiaoyong Du", + "Linlin Shen", + "Kimmo Yan" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.20", + "point2d": [ + -61.548465728759766, + 29.103002548217773 + ], + "cluster": 16.0 + }, + { + "idx": 1096, + "title": "NeuroX Library for Neuron Analysis of Deep NLP Models", + "abstract": "Neuron analysis provides insights into how knowledge is structured in representations and discovers the role of neurons in the network. In addition to developing an understanding of our models, neuron analysis enables various applications such as debiasing, domain adaptation and architectural search. We present NeuroX, a comprehensive open-source toolkit to conduct neuron analysis of natural language processing models. It implements various interpretation methods under a unified API, and provides a framework for data processing and evaluation, thus making it easier for researchers and practitioners to perform neuron analysis. The Python toolkit is available at https://www.github.com/fdalvi/NeuroX. Demo Video available at: https://youtu.be/mLhs2YMx4u8", + "authors": [ + "Fahim Dalvi", + "Hassan Sajjad", + "Nadir Durrani" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.21", + "point2d": [ + -38.421573638916016, + -33.518577575683594 + ], + "cluster": 6.0 + }, + { + "idx": 1097, + "title": "SciLit: A Platform for Joint Scientific Literature Discovery, Summarization and Citation Generation", + "abstract": "Scientific writing involves retrieving, summarizing, and citing relevant papers, which can be time-consuming processes. Although in many workflows these processes are serially linked, there are opportunities for natural language processing (NLP) to provide end-to-end assistive tools. We propose SciLit, a pipeline that automatically recommends relevant papers, extracts highlights, and suggests a reference sentence as a citation of a paper, taking into consideration the user-provided context and keywords. SciLit efficiently recommends papers from large databases of hundreds of millions of papers using a two-stage pre-fetching and re-ranking literature search system that flexibly deals with addition and removal of a paper database. We provide a convenient user interface that displays the recommended papers as extractive summaries and that offers abstractively-generated citing sentences which are aligned with the provided context and which mention the chosen keyword(s). Our assistive tool for literature discovery and scientific writing is available at https://scilit.vercel.app", + "authors": [ + "Nianlong Gu", + "Richard H.R. 
Hahnloser" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.22", + "point2d": [ + 16.103483200073242, + 16.46181297302246 + ], + "cluster": 40.0 + }, + { + "idx": 1098, + "title": "Massively Multi-Lingual Event Understanding: Extraction, Visualization, and Search", + "abstract": "In this paper, we present ISI-Clear, a state-of-the-art, cross-lingual, zero-shot event extraction system and accompanying user interface for event visualization & search. Using only English training data, ISI-Clear makes global events available on-demand, processing user-supplied text in 100 languages ranging from Afrikaans to Yiddish. We provide multiple event-centric views of extracted events, including both a graphical representation and a document-level summary. We also integrate existing cross-lingual search algorithms with event extraction capabilities to provide cross-lingual event-centric search, allowing English-speaking users to search over events automatically extracted from a corpus of non-English documents, using either English natural language queries (e.g. \u201ccholera outbreaks in Iran\u201d) or structured queries (e.g. find all events of type Disease-Outbreak with agent \u201ccholera\u201d and location \u201cIran\u201d).", + "authors": [ + "Chris Jenkins", + "Shantanu Agarwal", + "Joel Barry", + "Steven Fincke", + "Elizabeth Boschee" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.23", + "point2d": [ + 48.63985061645508, + -44.969696044921875 + ], + "cluster": 28.0 + }, + { + "idx": 1099, + "title": "YANMTT: Yet Another Neural Machine Translation Toolkit", + "abstract": "In this paper, we present our open-source neural machine translation (NMT) toolkit called \u201cYet Another Neural Machine Translation Toolkit\u201d abbreviated as YANMTT - https://github.com/prajdabre/yanmtt, which is built on top of the HuggingFace Transformers library. YANMTT focuses on transfer learning and enables easy pre-training and fine-tuning of sequence-to-sequence models at scale. It can be used for training parameter-heavy models with minimal parameter sharing and efficient, lightweight models via heavy parameter sharing. Additionally, it supports parameter-efficient fine-tuning (PEFT) through adapters and prompts. Our toolkit also comes with a user interface that can be used to demonstrate these models and visualize various parts of the model. Apart from these core features, our toolkit also provides other advanced functionalities such as but not limited to document/multi-source NMT, simultaneous NMT, mixtures-of-experts, model compression and continual learning.", + "authors": [ + "Raj Dabre", + "Diptesh Kanojia", + "Chinmay Sawant", + "Eiichiro Sumita" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.24", + "point2d": [ + -64.9065170288086, + -10.004406929016113 + ], + "cluster": 21.0 + }, + { + "idx": 1100, + "title": "XMD: An End-to-End Framework for Interactive Explanation-Based Debugging of NLP Models", + "abstract": "NLP models are susceptible to learning spurious biases (i.e., bugs) that work on some datasets but do not properly reflect the underlying task. Explanation-based model debugging aims to resolve spurious biases by showing human users explanations of model behavior, asking users to give feedback on the behavior, thenusing the feedback to update the model. 
While existing model debugging methods have shown promise, their prototype-level implementations provide limited practical utility. Thus, we propose XMD: the first open-source, end-to-end framework for explanation-based model debugging. Given task- or instance-level explanations, users can flexibly provide various forms of feedback via an intuitive, web-based UI. After receiving user feedback, XMD automatically updates the model in real time, by regularizing the model so that its explanations align with the user feedback. The new model can then be easily deployed into real-world applications via Hugging Face. Using XMD, we can improve the model\u2019s OOD performance on text classification tasks by up to 18%.", + "authors": [ + "Dong-Ho Lee", + "Akshen Kadakia", + "Brihi Joshi", + "Aaron Chan", + "Ziyi Liu", + "Kiran Narahari", + "Takashi Shibuya", + "Ryosuke Mitani", + "Toshiyuki Sekiya", + "Jay Pujara", + "Xiang Ren" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.25", + "point2d": [ + 30.597530364990234, + -11.034926414489746 + ], + "cluster": 36.0 + }, + { + "idx": 1101, + "title": "OpenDelta: A Plug-and-play Library for Parameter-efficient Adaptation of Pre-trained Models", + "abstract": "The scale of large pre-trained models (PTMs) poses significant challenges in adapting to downstream tasks due to the high optimization overhead and storage costs associated with full-parameter fine-tuning. To address this, many studies explore parameter-efficient tuning methods, also framed as \u201cdelta tuning\u201d in Ding et al. (2022), which updates only a small subset of parameters, known as \u201cdelta modules\u201d, while keeping the backbone model\u2019s parameters fixed. However, the practicality and flexibility of delta tuning have been limited due to existing implementations that directly modify the code of the backbone PTMs and hard-code specific delta tuning methods for each PTM. In this paper, we present OpenDelta, an open-source library that overcomes these limitations by providing a plug-and-play implementation of various delta tuning methods. Our novel techniques eliminate the need to modify the backbone PTMs\u2019 code, making OpenDelta compatible with different, even novel PTMs. OpenDelta is designed to be simple, modular, and extensible, providing a comprehensive platform for researchers and practitioners to adapt large PTMs efficiently.", + "authors": [ + "Shengding Hu", + "Ning Ding", + "Weilin Zhao", + "Xingtai Lv", + "Zhen Zhang", + "Zhiyuan Liu", + "Maosong Sun" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.26", + "point2d": [ + -35.076560974121094, + -13.721684455871582 + ], + "cluster": 8.0 + }, + { + "idx": 1102, + "title": "Hierarchy Builder: Organizing Textual Spans into a Hierarchy to Facilitate Navigation", + "abstract": "Information extraction systems often produce hundreds to thousands of strings on a specific topic. We present a method that facilitates better consumption of these strings, in an exploratory setting in which a user wants to both get a broad overview of what\u2019s available, and a chance to dive deeper on some aspects. The system works by grouping similar items together, and arranging the remaining items into a hierarchical navigable DAG structure.
We apply the method to medical information extraction.", + "authors": [ + "Itay Yair", + "Hillel Taub-Tabib", + "Yoav Goldberg" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.27", + "point2d": [ + 32.423248291015625, + -57.9422721862793 + ], + "cluster": 40.0 + }, + { + "idx": 1103, + "title": "CARE: Collaborative AI-Assisted Reading Environment", + "abstract": "Recent years have seen impressive progress in AI-assisted writing, yet the developments in AI-assisted reading are lacking. We propose inline commentary as a natural vehicle for AI-based reading assistance, and present CARE: the first open integrated platform for the study of inline commentary and reading. CARE facilitates data collection for inline commentaries in a commonplace collaborative reading environment, and provides a framework for enhancing reading with NLP-based assistance, such as text classification, generation or question answering. The extensible behavioral logging allows unique insights into the reading and commenting behavior, and flexible configuration makes the platform easy to deploy in new scenarios. To evaluate CARE in action, we apply the platform in a user study dedicated to scholarly peer review. CARE facilitates the data collection and study of inline commentary in NLP, extrinsic evaluation of NLP assistance, and application prototyping. We invite the community to explore and build upon the open source implementation of CARE. Github Repository: https://github.com/UKPLab/CARE. Public Live Demo: https://care.ukp.informatik.tu-darmstadt.de", + "authors": [ + "Dennis Zyska", + "Nils Dycke", + "Jan Buchmann", + "Ilia Kuznetsov", + "Iryna Gurevych" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.28", + "point2d": [ + -23.63068389892578, + 37.57521438598633 + ], + "cluster": 47.0 + }, + { + "idx": 1104, + "title": "The ROOTS Search Tool: Data Transparency for LLMs", + "abstract": "ROOTS is a 1.6TB multilingual text corpus developed for the training of BLOOM, currently the largest language model explicitly accompanied by commensurate data governance efforts. In continuation of these efforts, we present the ROOTS Search Tool: a search engine over the entire ROOTS corpus offering both fuzzy and exact search capabilities. ROOTS is the largest corpus to date that can be investigated this way. The ROOTS Search Tool is open-sourced and available on Hugging Face Spaces: https://huggingface.co/spaces/bigscience-data/roots-search. We describe our implementation and the possible use cases of our tool.", + "authors": [ + "Aleksandra Piktus", + "Christopher Akiki", + "Paulo Villegas", + "Hugo Lauren\u00e7on", + "G\u00e9rard Dupont", + "Sasha Luccioni", + "Yacine Jernite", + "Anna Rogers" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.29", + "point2d": [ + 18.309898376464844, + 9.983231544494629 + ], + "cluster": 40.0 + }, + { + "idx": 1105, + "title": "The OPUS-MT Dashboard \u2013 A Toolkit for a Systematic Evaluation of Open Machine Translation Models", + "abstract": "The OPUS-MT dashboard is a web-based platform that provides a comprehensive overview of open translation models. We focus on a systematic collection of benchmark results with verifiable translation performance and large coverage in terms of languages and domains.
We provide results for in-house OPUS-MT and Tatoeba models as well as external models from the Huggingface repository and user-contributed translations. The functionalities of the evaluation tool include summaries of benchmarks for over 2,300 models covering 4,560 language directions and 294 languages, as well as the inspection of predicted translations against their human reference. We focus on centralization, reproducibility and coverage of MT evaluation combined with scalability. The dashboard can be accessed live at https://opus.nlpl.eu/dashboard/.", + "authors": [ + "J\u00f6rg Tiedemann", + "Ona de Gibert" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.30", + "point2d": [ + -68.04979705810547, + -0.9427087903022766 + ], + "cluster": 1.0 + }, + { + "idx": 1106, + "title": "The D-WISE Tool Suite: Multi-Modal Machine-Learning-Powered Tools Supporting and Enhancing Digital Discourse Analysis", + "abstract": "This work introduces the D-WISE Tool Suite (DWTS), a novel working environment for digital qualitative discourse analysis in the Digital Humanities (DH). The DWTS addresses limitations of current DH tools induced by the ever-increasing amount of heterogeneous, unstructured, and multi-modal data in which the discourses of contemporary societies are encoded. To provide meaningful insights from such data, our system leverages and combines state-of-the-art machine learning technologies from Natural Language Processing and Computer Vision. Further, the DWTS is conceived and developed by an interdisciplinary team of cultural anthropologists and computer scientists to ensure the tool\u2019s usability for modern DH research. Central features of the DWTS are: a) import of multi-modal data like text, image, audio, and video b) preprocessing pipelines for automatic annotations c) lexical and semantic search of documents d) manual span, bounding box, time-span, and frame annotations e) documentation of the research process.", + "authors": [ + "Florian Schneider", + "Tim Fischer", + "Fynn Petersen-Frey", + "Isabel Eiser", + "Gertraud Koch", + "Chris Biemann" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.31", + "point2d": [ + 35.34033203125, + 29.838855743408203 + ], + "cluster": 19.0 + }, + { + "idx": 1107, + "title": "OpenRT: An Open-source Framework for Reasoning Over Tabular Data", + "abstract": "There are a growing number of table pre-training methods proposed for reasoning over tabular data (e.g., question answering, fact checking, and faithful text generation). However, most existing methods are benchmarked solely on a limited number of datasets, varying in configuration, which leads to a lack of unified, standardized, fair, and comprehensive comparison between methods. This paper presents OpenRT, the first open-source framework for reasoning over tabular data, to reproduce existing table pre-training models for performance comparison and develop new models quickly. We implemented and compared six table pre-training models on four question answering, one fact checking, and one faithful text generation datasets. Moreover, to enable the community to easily construct new table reasoning datasets, we developed TaRAT, an annotation tool which supports multi-person collaborative annotations for various kinds of table reasoning tasks.
The researchers are able to deploy the newly-constructed dataset to OpenRT and compare the performances of different baseline systems.", + "authors": [ + "Yilun Zhao", + "Boyu Mi", + "Zhenting Qi", + "Linyong Nan", + "Minghao Guo", + "Arman Cohan", + "Dragomir Radev" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.32", + "point2d": [ + 74.54938507080078, + 5.860837459564209 + ], + "cluster": 5.0 + }, + { + "idx": 1108, + "title": "UINAUIL: A Unified Benchmark for Italian Natural Language Understanding", + "abstract": "This paper introduces the Unified Interactive Natural Understanding of the Italian Language (UINAUIL), a benchmark of six tasks for Italian Natural Language Understanding. We present a description of the tasks and software library that collects the data from the European Language Grid, harmonizes the data format, and exposes functionalities to facilitate data manipulation and the evaluation of custom models. We also present the results of tests conducted with available Italian and multilingual language models on UINAUIL, providing an updated picture of the current state of the art in Italian NLU.", + "authors": [ + "Valerio Basile", + "Livio Bioglio", + "Alessio Bosca", + "Cristina Bosco", + "Viviana Patti" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.33", + "point2d": [ + -21.570653915405273, + -37.25014114379883 + ], + "cluster": 46.0 + }, + { + "idx": 1109, + "title": "Zshot: An Open-source Framework for Zero-Shot Named Entity Recognition and Relation Extraction", + "abstract": "The Zero-Shot Learning (ZSL) task pertains to the identification of entities or relations in texts that were not seen during training. ZSL has emerged as a critical research area due to the scarcity of labeled data in specific domains, and its applications have grown significantly in recent years. With the advent of large pretrained language models, several novel methods have been proposed, resulting in substantial improvements in ZSL performance. There is a growing demand, both in the research community and industry, for a comprehensive ZSL framework that facilitates the development and accessibility of the latest methods and pretrained models. In this study, we propose a novel ZSL framework called Zshot that aims to address the aforementioned challenges. Our primary objective is to provide a platform that allows researchers to compare different state-of-the-art ZSL methods with standard benchmark datasets. Additionally, we have designed our framework to support the industry with readily available APIs for production under the standard SpaCy NLP pipeline. Our API is extendible and evaluable; moreover, we include numerous enhancements such as boosting the accuracy with pipeline ensembling and visualization utilities available as a SpaCy extension.", + "authors": [ + "Gabriele Picco", + "Marcos Martinez Galindo", + "Alberto Purpura", + "Leopold Fuchs", + "Vanessa Lopez", + "Thanh Lam Hoang" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.34", + "point2d": [ + 33.75014114379883, + -78.54756164550781 + ], + "cluster": 14.0 + }, + { + "idx": 1110, + "title": "BiSync: A Bilingual Editor for Synchronized Monolingual Texts", + "abstract": "In our globalized world, a growing number of situations arise where people are required to communicate in one or several foreign languages.
In the case of written communication, users with a good command of a foreign language may find assistance from computer-aided translation (CAT) technologies. These technologies often allow users to access external resources, such as dictionaries, terminologies or bilingual concordancers, thereby interrupting and considerably hindering the writing process. In addition, CAT systems assume that the source sentence is fixed and also restrict the possible changes on the target side. In order to make the writing process smoother, we present BiSync, a bilingual writing assistant that allows users to freely compose text in two languages, while maintaining the two monolingual texts synchronized. We also include additional functionalities, such as the display of alternative prefix translations and paraphrases, which are intended to facilitate the authoring of texts. We detail the model architecture used for synchronization and evaluate the resulting tool, showing that high accuracy can be attained with limited computational resources. The interface and models are publicly available at https://github.com/jmcrego/BiSync and a demonstration video can be watched on YouTube https://youtu.be/_l-ugDHfNgU.", + "authors": [ + "Josep Crego", + "Jitao Xu", + "Fran\u00e7ois Yvon" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.35", + "point2d": [ + -63.98710632324219, + 1.827942967414856 + ], + "cluster": 1.0 + }, + { + "idx": 1111, + "title": "Riveter: Measuring Power and Social Dynamics Between Entities", + "abstract": "Riveter provides a complete easy-to-use pipeline for analyzing verb connotations associated with entities in text corpora. We prepopulate the package with connotation frames of sentiment, power, and agency, which have demonstrated usefulness for capturing social phenomena, such as gender bias, in a broad range of corpora. For decades, lexical frameworks have been foundational tools in computational social science, digital humanities, and natural language processing, facilitating multifaceted analysis of text corpora. But working with verb-centric lexica specifically requires natural language processing skills, reducing their accessibility to other researchers. By organizing the language processing pipeline, providing complete lexicon scores and visualizations for all entities in a corpus, and providing functionality for users to target specific research questions, Riveter greatly improves the accessibility of verb lexica and can facilitate a broad range of future research.", + "authors": [ + "Maria Antoniak", + "Anjalie Field", + "Jimin Mun", + "Melanie Walsh", + "Lauren Klein", + "Maarten Sap" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.36", + "point2d": [ + 33.7536735534668, + 31.564682006835938 + ], + "cluster": 19.0 + }, + { + "idx": 1112, + "title": "Fast Whitespace Correction with Encoder-Only Transformers", + "abstract": "The goal of whitespace correction is to fix space errors in arbitrary given text. For example, given the text \u201cwhi te space correctio nwithTransf or mers\u201d, produce \u201cwhitespace correction with Transformers\u201d. We compare two Transformer-based models, a character-level encoder-decoder model and a byte-level encoder-only model. We find that the encoder-only model is both faster and achieves higher quality. We provide an easy-to-use tool that is over 900 times faster than the previous best tool, with the same high quality. 
Our tool repairs text at a rate of over 200 kB/s on GPU, with a sequence-averaged F1-score ranging from 87.5% for hard-to-correct text up to 99% for text without any spaces.", + "authors": [ + "Hannah Bast", + "Matthias Hertel", + "Sebastian Walter" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.37", + "point2d": [ + -36.299842834472656, + 8.418474197387695 + ], + "cluster": 30.0 + }, + { + "idx": 1113, + "title": "ESPnet-ST-v2: Multipurpose Spoken Language Translation Toolkit", + "abstract": "ESPnet-ST-v2 is a revamp of the open-source ESPnet-ST toolkit necessitated by the broadening interests of the spoken language translation community. ESPnet-ST-v2 supports 1) offline speech-to-text translation (ST), 2) simultaneous speech-to-text translation (SST), and 3) offline speech-to-speech translation (S2ST) \u2013 each task is supported with a wide variety of approaches, differentiating ESPnet-ST-v2 from other open source spoken language translation toolkits. This toolkit offers state-of-the-art architectures such as transducers, hybrid CTC/attention, multi-decoders with searchable intermediates, time-synchronous blockwise CTC/attention, Translatotron models, and direct discrete unit models. In this paper, we describe the overall design, example models for each task, and performance benchmarking behind ESPnet-ST-v2, which is publicly available at https://github.com/espnet/espnet.", + "authors": [ + "Brian Yan", + "Jiatong Shi", + "Yun Tang", + "Hirofumi Inaguma", + "Yifan Peng", + "Siddharth Dalmia", + "Peter Polak", + "Patrick Fernandes", + "Dan Berrebbi", + "Tomoki Hayashi", + "Xiaohui Zhang", + "Zhaoheng Ni", + "Moto Hira", + "Soumi Maiti", + "Juan Pino", + "Shinji Watanabe" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.38", + "point2d": [ + -65.23681640625, + 18.332561492919922 + ], + "cluster": 37.0 + }, + { + "idx": 1114, + "title": "CB2: Collaborative Natural Language Interaction Research Platform", + "abstract": "CB2 is a multi-agent platform to study collaborative natural language interaction in a grounded task-oriented scenario. It includes a 3D game environment, a backend server designed to serve trained models to human agents, and various tools and processes to enable scalable studies. We deploy CB2 at https://cb2.ai as a system demonstration with a learned instruction following model.", + "authors": [ + "Jacob Sharf", + "Mustafa Omer Gul", + "Yoav Artzi" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.39", + "point2d": [ + 28.706750869750977, + 55.699337005615234 + ], + "cluster": 2.0 + }, + { + "idx": 1115, + "title": "Inseq: An Interpretability Toolkit for Sequence Generation Models", + "abstract": "Past work in natural language processing interpretability focused mainly on popular classification tasks while largely overlooking generation settings, partly due to a lack of dedicated tools. In this work, we introduce Inseq, a Python library to democratize access to interpretability analyses of sequence generation models. Inseq enables intuitive and optimized extraction of models\u2019 internal information and feature importance scores for popular decoder-only and encoder-decoder Transformers architectures. We showcase its potential by adopting it to highlight gender biases in machine translation models and locate factual knowledge inside GPT-2. 
Thanks to its extensible interface supporting cutting-edge techniques such as contrastive feature attribution, Inseq can drive future advances in explainable natural language generation, centralizing good practices and enabling fair and reproducible model evaluations.", + "authors": [ + "Gabriele Sarti", + "Nils Feldhus", + "Ludwig Sickert", + "Oskar van der Wal" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.40", + "point2d": [ + -21.31083869934082, + 13.235383987426758 + ], + "cluster": 4.0 + }, + { + "idx": 1116, + "title": "Pipeline for modeling causal beliefs from natural language", + "abstract": "We present a causal language analysis pipeline that leverages a Large Language Model to identify causal claims made in natural language documents, and aggregates claims across a corpus to produce a causal claim network. The pipeline then applies a clustering algorithm that groups causal claims based on their semantic topics. We demonstrate the pipeline by modeling causal belief systems surrounding the Covid-19 vaccine from tweets.", + "authors": [ + "John Priniski", + "Ishaan Verma", + "Fred Morstatter" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.41", + "point2d": [ + 30.971782684326172, + 12.023388862609863 + ], + "cluster": 19.0 + }, + { + "idx": 1117, + "title": "TabGenie: A Toolkit for Table-to-Text Generation", + "abstract": "Heterogeneity of data-to-text generation datasets limits the research on data-to-text generation systems. We present TabGenie \u2013 a toolkit which enables researchers to explore, preprocess, and analyze a variety of data-to-text generation datasets through the unified framework of table-to-text generation. In TabGenie, all inputs are represented as tables with associated metadata. The tables can be explored through a web interface, which also provides an interactive mode for debugging table-to-text generation, facilitates side-by-side comparison of generated system outputs, and allows easy exports for manual analysis. Furthermore, TabGenie is equipped with command line processing tools and Python bindings for unified dataset loading and processing. We release TabGenie as a PyPI package and provide its open-source code and a live demo at https://github.com/kasnerz/tabgenie.", + "authors": [ + "Zden\u011bk Kasner", + "Ekaterina Garanina", + "Ondrej Platek", + "Ondrej Dusek" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.42", + "point2d": [ + -16.8038272857666, + 13.860544204711914 + ], + "cluster": 4.0 + }, + { + "idx": 1118, + "title": "An Efficient Conversational Smart Compose System", + "abstract": "Online conversation is a ubiquitous way to share information and connect everyone, but repetitive idiomatic text typing takes users a lot of time. This paper demonstrates a simple yet effective cloud-based smart compose system to improve human-to-human conversation efficiency. Heuristics from different perspectives are designed to achieve the best trade-off between quality and latency. From the modeling side, the decoder-only model exploits the previous turns of conversational history in a computationally lightweight manner. Besides, a novel phrase tokenizer is proposed to reduce latency without losing the composing quality further. Additionally, the caching mechanism is applied to the serving framework.
The demo video of the system is available at https://youtu.be/U1KXkaqr60g. We open-sourced our phrase tokenizer in https://github.com/tensorflow/text.", + "authors": [ + "Yun Zhu", + "Xiayu Chen", + "Lei Shu", + "Bowen Tan", + "Xinying Song", + "Lijuan Liu", + "Maria Wang", + "Jindong Chen", + "Ning Ruan" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.43", + "point2d": [ + 10.366293907165527, + 62.37361145019531 + ], + "cluster": 29.0 + }, + { + "idx": 1119, + "title": "Which Spurious Correlations Impact Reasoning in NLI Models? A Visual Interactive Diagnosis through Data-Constrained Counterfactuals", + "abstract": "We present a human-in-the-loop dashboard tailored to diagnosing potential spurious features that NLI models rely on for predictions. The dashboard enables users to generate diverse and challenging examples by drawing inspiration from GPT-3 suggestions. Additionally, users can receive feedback from a trained NLI model on how challenging the newly created example is and make refinements based on the feedback. Through our investigation, we discover several categories of spurious correlations that impact the reasoning of NLI models, which we group into three categories: Semantic Relevance, Logical Fallacies, and Bias. Based on our findings, we identify and describe various research opportunities, including diversifying training data and assessing NLI models\u2019 robustness by creating adversarial test suites.", + "authors": [ + "Robin Chan", + "Afra Amini", + "Mennatallah El-Assady" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.44", + "point2d": [ + 29.076602935791016, + -3.2383241653442383 + ], + "cluster": 31.0 + }, + { + "idx": 1120, + "title": "LaTeX2Solver: a Hierarchical Semantic Parsing of LaTeX Document into Code for an Assistive Optimization Modeling Application", + "abstract": "We demonstrate an interactive system to help operations research (OR) practitioners convert the mathematical formulation of optimization problems from TeX document format into the solver modeling language. In practice, a manual translation is cumbersome and time-consuming. Moreover, it requires an in-depth understanding of the problem description and technical expertise to produce the modeling code. Thus, our proposed system TeX2Solver helps partially automate this conversion and helps users build optimization models more efficiently. In this paper, we describe its interface and the components of the hierarchical parsing system. A video demo walk-through is available online at http://bit.ly/3kuOm3x", + "authors": [ + "Rindra Ramamonjison", + "Timothy Yu", + "Linzi Xing", + "Mahdi Mostajabdaveh", + "Xiaorui Li", + "Xiaojin Fu", + "Xiongwei Han", + "Yuanzhe Chen", + "Ren Li", + "Kun Mao", + "Yong Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.45", + "point2d": [ + 41.78861618041992, + -24.073081970214844 + ], + "cluster": 12.0 + }, + { + "idx": 1121, + "title": "Alfred: A System for Prompted Weak Supervision", + "abstract": "Alfred is the first system for programmatic weak supervision (PWS) that creates training data for machine learning by prompting. In contrast to typical PWS systems where weak supervision sources are programs coded by experts, Alfred enables users to encode their subject matter expertise via natural language prompts for language and vision-language models.
Alfred provides a simple Python interface for the key steps of this emerging paradigm, with a high-throughput backend for large-scale data labeling. Users can quickly create, evaluate, and refine their prompt-based weak supervision sources; map the results to weak labels; and resolve their disagreements with a label model. Alfred enables a seamless local development experience backed by models served from self-managed computing clusters. It automatically optimizes the execution of prompts with optimized batching mechanisms. We find that this optimization improves query throughput by 2.9x versus a naive approach. We present two example use cases demonstrating Alfred on YouTube comment spam detection and pet breeds classification. Alfred is open source, available at https://github.com/BatsResearch/alfred.", + "authors": [ + "Peilin Yu", + "Stephen Bach" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.46", + "point2d": [ + -3.7611351013183594, + -15.517059326171875 + ], + "cluster": 17.0 + }, + { + "idx": 1122, + "title": "OpenICL: An Open-Source Framework for In-context Learning", + "abstract": "In recent years, In-context Learning (ICL) has gained increasing attention and emerged as the new paradigm for large language model (LLM) evaluation. Unlike traditional fine-tuning methods, ICL adapts the pre-trained models to unseen tasks without any parameter updates. However, the implementation of ICL is sophisticated due to the diverse retrieval and inference methods involved, as well as the varying pre-processing requirements for different models, datasets, and tasks. A unified and flexible framework for ICL is urgently needed to ease the implementation of the aforementioned components. To facilitate ICL research, we introduce OpenICL, an open-source toolkit for ICL and LLM evaluation. OpenICL is research-friendly, with a highly flexible architecture in which users can easily combine different components to suit their needs. It also provides various state-of-the-art retrieval and inference methods to streamline the process of adapting ICL to cutting-edge research. The effectiveness of OpenICL has been validated on a wide range of NLP tasks, including classification, QA, machine translation, and semantic parsing. As a side-product, we found OpenICL to be an efficient yet robust tool for LLMs evaluation. OpenICL is released at https://github.com/Shark-NLP/OpenICL.", + "authors": [ + "Zhenyu Wu", + "Yaoxiang Wang", + "Jiacheng Ye", + "Zhiyong Wu", + "Jiangtao Feng", + "Jingjing Xu", + "Yu Qiao" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.47", + "point2d": [ + -15.34601879119873, + -26.52857208251953 + ], + "cluster": 20.0 + }, + { + "idx": 1123, + "title": "Self-Supervised Sentence Polishing by Adding Engaging Modifiers", + "abstract": "Teachers often guide students to improve their essays by adding engaging modifiers to polish the sentences. In this work, we present the first study on automatic sentence polishing by adding modifiers. Since there is no available dataset for the new task, we first automatically construct a large amount of parallel data by removing modifiers in the engaging sentences collected from public resources. Then we fine-tune LongLM to reconstruct the original sentences from the corrupted ones.
Considering that much overlap between inputs and outputs may bias the model to completely copy the inputs, we split each source sentence into sub-sentences and only require the model to generate the modified sub-sentences. Furthermore, we design a retrieval augmentation algorithm to prompt the model to add suitable modifiers. Automatic and manual evaluations on the auto-constructed test set and real human texts show that our model can generate more engaging sentences with suitable modifiers than strong baselines while keeping fluency. We deploy the model at http://coai.cs.tsinghua.edu.cn/static/polishSent/. A demo video is available at https://youtu.be/Y6gFHOgSv8Y.", + "authors": [ + "Zhexin Zhang", + "Jian Guan", + "Xin Cui", + "Yu Ran", + "Bo Liu", + "Minlie Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.48", + "point2d": [ + -30.427839279174805, + 26.918397903442383 + ], + "cluster": 35.0 + }, + { + "idx": 1124, + "title": "Effidit: An Assistant for Improving Writing Efficiency", + "abstract": "Writing assistants are valuable tools that can help writers improve their writing skills. We introduce Effidit (Efficient and Intelligent Editing), a digital writing assistant that helps users write higher-quality text more efficiently through the use of Artificial Intelligence (AI) and Natural Language Processing (NLP) technologies. We significantly expand the capacities of a writing assistant by providing functions in three modules: text completion, hint recommendation, and writing refinement. Based on the above efforts, Effidit can efficiently assist users in creating their own text. Effidit has been deployed to several Tencent products and publicly released at https://effidit.qq.com/.", + "authors": [ + "Shuming Shi", + "Enbo Zhao", + "Wei Bi", + "Deng Cai", + "Leyang Cui", + "Xinting Huang", + "Haiyun Jiang", + "Duyu Tang", + "Kaiqiang Song", + "Longyue Wang", + "Chenyan Huang", + "Guoping Huang", + "Yan Wang", + "Piji Li" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.49", + "point2d": [ + -23.32770538330078, + 37.182857513427734 + ], + "cluster": 35.0 + }, + { + "idx": 1125, + "title": "WizMap: Scalable Interactive Visualization for Exploring Large Machine Learning Embeddings", + "abstract": "Machine learning models often learn latent embedding representations that capture the domain semantics of their training data. These embedding representations are valuable for interpreting trained models, building new models, and analyzing new datasets. However, interpreting and using embeddings can be challenging due to their opaqueness, high dimensionality, and the large size of modern datasets. To tackle these challenges, we present WizMap, an interactive visualization tool to help researchers and practitioners easily explore large embeddings. With a novel multi-resolution embedding summarization method and a familiar map-like interaction design, WizMap enables users to navigate and interpret embedding spaces with ease. Leveraging modern web technologies such as WebGL and Web Workers, WizMap scales to millions of embedding points directly in users\u2019 web browsers and computational notebooks without the need for dedicated backend servers. WizMap is open-source and available at the following public demo link: https://poloclub.github.io/wizmap.", + "authors": [ + "Zijie J. 
Wang", + "Fred Hohman", + "Duen Horng Chau" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.50", + "point2d": [ + -7.3170952796936035, + -27.17514991760254 + ], + "cluster": 9.0 + }, + { + "idx": 1126, + "title": "A System for Answering Simple Questions in Multiple Languages", + "abstract": "Our research focuses on the most prevalent type of queries\u2014 simple questions \u2014exemplified by questions like \u201cWhat is the capital of France?\u201d. These questions reference an entity such as \u201cFrance\u201d, which is directly connected (one hop) to the answer entity \u201cParis\u201d in the underlying knowledge graph (KG). We propose a multilingual Knowledge Graph Question Answering (KGQA) technique that orders potential responses based on the distance between the question\u2019s text embeddings and the answer\u2019s graph embeddings. A system incorporating this novel method is also described in our work.Through comprehensive experimentation using various English and multilingual datasets and two KGs \u2014 Freebase and Wikidata \u2014 we illustrate the comparative advantage of the proposed method across diverse KG embeddings and languages. This edge is apparent even against robust baseline systems, including seq2seq QA models, search-based solutions and intricate rule-based pipelines. Interestingly, our research underscores that even advanced AI systems like ChatGPT encounter difficulties when tasked with answering simple questions. This finding emphasizes the relevance and effectiveness of our approach, which consistently outperforms such systems. We are making the source code and trained models from our study publicly accessible to promote further advancements in multilingual KGQA.", + "authors": [ + "Anton Razzhigaev", + "Mikhail Salnikov", + "Valentin Malykh", + "Pavel Braslavski", + "Alexander Panchenko" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.51", + "point2d": [ + 72.14401245117188, + 3.3620855808258057 + ], + "cluster": 5.0 + }, + { + "idx": 1127, + "title": "KWJA: A Unified Japanese Analyzer Based on Foundation Models", + "abstract": "We present KWJA, a high-performance unified Japanese text analyzer based on foundation models.KWJA supports a wide range of tasks, including typo correction, word segmentation, word normalization, morphological analysis, named entity recognition, linguistic feature tagging, dependency parsing, PAS analysis, bridging reference resolution, coreference resolution, and discourse relation analysis, making it the most versatile among existing Japanese text analyzers.KWJA solves these tasks in a multi-task manner but still achieves competitive or better performance compared to existing analyzers specialized for each task.KWJA is publicly available under the MIT license at https://github.com/ku-nlp/kwja.", + "authors": [ + "Nobuhiro Ueda", + "Kazumasa Omura", + "Takashi Kodama", + "Hirokazu Kiyomaru", + "Yugo Murawaki", + "Daisuke Kawahara", + "Sadao Kurohashi" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.52", + "point2d": [ + -41.60281753540039, + -40.50751876831055 + ], + "cluster": 46.0 + }, + { + "idx": 1128, + "title": "Disease Network Constructor: a Pathway Extraction and Visualization", + "abstract": "We present Disease Network Constructor (DNC), a system that extracts and visualizes a disease network, in which nodes are entities such as diseases, proteins, and 
genes, and edges represent regulation relations. We focused on the disease network derived through regulation events found in scientific articles on idiopathic pulmonary fibrosis (IPF). The front-end web-based user interface of DNC includes two-dimensional (2D) and 3D visualizations of the constructed disease network. The back-end system of DNC includes several natural language processing (NLP) techniques to process biomedical text, including tokenization based on Bidirectional Encoder Representations from Transformers (BERT), flat and nested named entity recognition (NER), candidate generation and candidate ranking for entity linking (EL), relation extraction (RE), and event extraction (EE) tasks. We evaluated the end-to-end EL and end-to-end nested EE systems to determine the DNC\u2019s back-end implementation performance. To the best of our knowledge, this is the first attempt to address neural NER, EL, RE, and EE tasks in an end-to-end manner and construct a pathway visualization from events, which we name Disease Network Constructor. The demonstration video can be accessed from https://youtu.be/rFhWwAgcXE8. We release an online system for end users, and the source code is available at https://github.com/aistairc/PRISM-APIs/.", + "authors": [ + "Mohammad Golam Sohrab", + "Khoa Duong", + "Goran Topi\u0107", + "Masami Ikeda", + "Nozomi Nagano", + "Yayoi Natsume-Kitatani", + "Masakata Kuroda", + "Mari Itoh", + "Hiroya Takamura" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.53", + "point2d": [ + 34.60267639160156, + -43.194114685058594 + ], + "cluster": 42.0 + }, + { + "idx": 1129, + "title": "Petals: Collaborative Inference and Fine-tuning of Large Models", + "abstract": "Many NLP tasks benefit from using large language models (LLMs) that often have more than 100 billion parameters. With the release of BLOOM-176B and OPT-175B, everyone can download pretrained models of this scale. Still, using these models requires high-end hardware unavailable to many researchers. In some cases, LLMs can be used more affordably via RAM offloading or hosted APIs. However, these techniques have innate limitations: offloading is too slow for interactive inference, while APIs are not flexible enough for research that requires access to weights, attention or logits. In this work, we propose Petals - a system for inference and fine-tuning of large models collaboratively by joining the resources of multiple parties. We demonstrate that this strategy outperforms offloading for very large models, running inference of BLOOM-176B on consumer GPUs with \u22481 step per second, which is enough for many interactive LLM applications. Unlike most inference APIs, Petals also natively exposes hidden states of served models, allowing users to train and share custom model extensions based on efficient fine-tuning methods.
The system, its source code, and documentation are available at https://petals.ml. Video (2 min): https://youtu.be/F4muLI-0hTE", + "authors": [ + "Alexander Borzunov", + "Dmitry Baranchuk", + "Tim Dettmers", + "Maksim Riabinin", + "Younes Belkada", + "Artem Chumachenko", + "Pavel Samygin", + "Colin Raffel" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.54", + "point2d": [ + -40.66624450683594, + -18.673280715942383 + ], + "cluster": 8.0 + }, + { + "idx": 1130, + "title": "UKP-SQuARE v3: A Platform for Multi-Agent QA Research", + "abstract": "The continuous development of Question Answering (QA) datasets has drawn the research community\u2019s attention toward multi-domain models. A popular approach is to use multi-dataset models, which are models trained on multiple datasets to learn their regularities and prevent overfitting to a single dataset. However, with the proliferation of QA models in online repositories such as GitHub or Hugging Face, an alternative is becoming viable. Recent works have demonstrated that combining expert agents can yield large performance gains over multi-dataset models. To ease research in multi-agent models, we extend UKP-SQuARE, an online platform for QA research, to support three families of multi-agent systems: i) agent selection, ii) early-fusion of agents, and iii) late-fusion of agents. We conduct experiments to evaluate their inference speed and discuss the performance vs. speed trade-off compared to multi-dataset models. UKP-SQuARE is open-source and publicly available.", + "authors": [ + "Haritz Puerto", + "Tim Baumg\u00e4rtner", + "Rachneet Sachdeva", + "Haishuo Fang", + "Hao Zhang", + "Sewin Tariverdian", + "Kexin Wang", + "Iryna Gurevych" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.55", + "point2d": [ + 60.937007904052734, + 9.818188667297363 + ], + "cluster": 5.0 + }, + { + "idx": 1131, + "title": "Ranger: A Toolkit for Effect-Size Based Multi-Task Evaluation", + "abstract": "In this paper, we introduce Ranger - a toolkit to facilitate the easy use of effect-size-based meta-analysis for multi-task evaluation in NLP and IR. We observed that our communities often face the challenge of aggregating results over incomparable metrics and scenarios, which makes conclusions and take-away messages less reliable. With Ranger, we aim to address this issue by providing a task-agnostic toolkit that combines the effect of a treatment on multiple tasks into one statistical evaluation, allowing for comparison of metrics and computation of an overall summary effect. Our toolkit produces publication-ready forest plots that enable clear communication of evaluation results over multiple tasks. Our goal with the ready-to-use Ranger toolkit is to promote robust, effect-size-based evaluation and improve evaluation standards in the community.
We provide two case studies for common IR and NLP settings to highlight Ranger\u2019s benefits.", + "authors": [ + "Mete Sertkan", + "Sophia Althammer", + "Sebastian Hofst\u00e4tter" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.56", + "point2d": [ + 15.477324485778809, + 10.254355430603027 + ], + "cluster": 40.0 + }, + { + "idx": 1132, + "title": "GAIA Search: Hugging Face and Pyserini Interoperability for NLP Training Data Exploration", + "abstract": "Noticing the urgent need to provide tools for fast and user-friendly qualitative analysis of large-scale textual corpora in modern NLP, we propose to turn to the mature and well-tested methods from the domain of Information Retrieval (IR) - a research field with a long history of tackling TB-scale document collections. We discuss how Pyserini - a widely used toolkit for reproducible IR research - can be integrated with the Hugging Face ecosystem of open-source AI libraries and artifacts. We leverage the existing functionalities of both platforms while proposing novel features further facilitating their integration. Our goal is to give NLP researchers tools that will allow them to develop retrieval-based instrumentation for their data analytics needs with ease and agility. We include a Jupyter Notebook-based walk through the core interoperability features, available on GitHub: https://github.com/huggingface/gaia. We then demonstrate how the ideas we present can be operationalized to create a powerful tool for qualitative data analysis in NLP. We present GAIA Search - a search engine built following previously laid out principles, giving access to four popular large-scale text collections. GAIA serves a dual purpose: it illustrates the potential of the methodologies we discuss and acts as a standalone qualitative analysis tool that can be leveraged by NLP researchers aiming to understand datasets prior to using them in training. GAIA is hosted live on Hugging Face Spaces: https://huggingface.co/spaces/spacerini/gaia.", + "authors": [ + "Aleksandra Piktus", + "Odunayo Ogundepo", + "Christopher Akiki", + "Akintunde Oladipo", + "Xinyu Zhang", + "Hailey Schoelkopf", + "Stella Biderman", + "Martin Potthast", + "Jimmy Lin" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.57", + "point2d": [ + 17.90241241455078, + 9.403011322021484 + ], + "cluster": 40.0 + }, + { + "idx": 1133, + "title": "DeepPavlov Dream: Platform for Building Generative AI Assistants", + "abstract": "The open-source DeepPavlov Dream Platform is specifically tailored for the development of complex dialog systems like Generative AI Assistants. The stack prioritizes efficiency, modularity, scalability, and extensibility with the goal of making it easier to develop complex dialog systems from scratch. It supports a modular approach to the implementation of conversational agents, enabling their development through the choice of NLP components and conversational skills from a rich library organized into distributions of ready-for-use multi-skill AI assistant systems.
In DeepPavlov Dream, a multi-skill Generative AI Assistant consists of NLP components that extract features from user utterances, conversational skills that generate or retrieve a response, skill and response selectors that facilitate the choice of relevant skills and the best response, as well as a conversational orchestrator that enables the creation of multi-skill Generative AI Assistants scalable up to industrial-grade AI assistants. The platform makes it possible to integrate large language models into the dialog pipeline, customize them with prompt engineering, handle multiple prompts during the same dialog session, and create simple multimodal assistants.", + "authors": [ + "Diliara Zharikova", + "Daniel Kornev", + "Fedor Ignatov", + "Maxim Talimanchuk", + "Dmitry Evseev", + "Ksenya Petukhova", + "Veronika Smilga", + "Dmitry Karpov", + "Yana Shishkina", + "Dmitry Kosenko", + "Mikhail Burtsev" + ], + "year": 2023, + "source": "acl", + "publication_type": "demo", + "doi": "10.18653/v1/2023.acl-demo.58", + "point2d": [ + 19.5919189453125, + 65.37812042236328 + ], + "cluster": 24.0 + }, + { + "idx": 1134, + "title": "ChatGPT vs Human-authored Text: Insights into Controllable Text Summarization and Sentence Style Transfer", + "abstract": "Large-scale language models, like ChatGPT, have garnered significant media attention and stunned the public with their remarkable capacity for generating coherent text from short natural language prompts. In this paper, we aim to conduct a systematic inspection of ChatGPT\u2019s performance in two controllable generation tasks, with respect to ChatGPT\u2019s ability to adapt its output to different target audiences (expert vs. layman) and writing styles (formal vs. informal). Additionally, we evaluate the faithfulness of the generated text, and compare the model\u2019s performance with human-authored texts. Our findings indicate that the stylistic variations produced by humans are considerably larger than those demonstrated by ChatGPT, and the generated texts diverge from human samples in several characteristics, such as the distribution of word types. Moreover, we observe that ChatGPT sometimes incorporates factual errors or hallucinations when adapting the text to suit a specific style.", + "authors": [ + "Dongqi Pu", + "Vera Demberg" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.1", + "point2d": [ + -24.978130340576172, + 38.51234817504883 + ], + "cluster": 35.0 + }, + { + "idx": 1135, + "title": "Multi-Dialectal Representation Learning of Sinitic Phonology", + "abstract": "Machine learning techniques have shown their competence in representing and reasoning in symbolic systems such as language and phonology. In Sinitic Historical Phonology, notable tasks that could benefit from machine learning include the comparison of dialects and reconstruction of proto-language systems. Motivated by this, this paper provides an approach for obtaining multi-dialectal representations of Sinitic syllables, by constructing a knowledge graph from structured phonological data, then applying the BoxE technique from knowledge base learning. We applied unsupervised clustering techniques to the obtained representations and observed that they capture phonemic contrast from the input dialects. Furthermore, we trained classifiers to perform inference of unobserved Middle Chinese labels, showing the representations\u2019 potential for indicating archaic, proto-language features.
The representations can be used to complete fragmented Sinitic phonological knowledge bases, estimate divergences between different characters, or aid the exploration and reconstruction of archaic features.", + "authors": [ + "Zhibai Jia" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.2", + "point2d": [ + -39.60474395751953, + -45.0200080871582 + ], + "cluster": 46.0 + }, + { + "idx": 1136, + "title": "Prompt-based Zero-shot Text Classification with Conceptual Knowledge", + "abstract": "In recent years, pre-trained language models have garnered significant attention due to their effectiveness, which stems from the rich knowledge acquired during pre-training. To mitigate the inconsistency issues between pre-training tasks and downstream tasks and to facilitate the resolution of language-related issues, prompt-based approaches have been introduced, which are particularly useful in low-resource scenarios. However, existing approaches mostly rely on verbalizers to translate the predicted vocabulary to task-specific labels. The major limitations of this approach are that it ignores potentially relevant domain-specific words and is biased by the pre-training data. To address these limitations, we propose a framework that incorporates conceptual knowledge for text classification in the extreme zero-shot setting. The framework includes prompt-based keyword extraction, weight assignment to each prompt keyword, and final representation estimation in the knowledge graph embedding space. We evaluated the method on four widely-used datasets for sentiment analysis and topic detection, demonstrating that it consistently outperforms recently-developed prompt-based approaches in the same experimental settings.", + "authors": [ + "Yuqi Wang", + "Wei Wang", + "Qi Chen", + "Kaizhu Huang", + "Anh Nguyen", + "Suparna De" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.4", + "point2d": [ + 2.9293508529663086, + -26.40720558166504 + ], + "cluster": 17.0 + }, + { + "idx": 1137, + "title": "How do different tokenizers perform on downstream tasks in scriptio continua languages?: A case study in Japanese", + "abstract": "This paper investigates the effect of tokenizers on the downstream performance of pretrained language models (PLMs) in scriptio continua languages where no explicit spaces exist between words, using Japanese as a case study. The tokenizer for such languages often consists of a morphological analyzer and a subword tokenizer, requiring us to conduct a comprehensive study of all possible pairs. However, previous studies lack this comprehensiveness. We therefore train extensive sets of tokenizers, build a PLM using each, and measure the downstream performance on a wide range of tasks.
Our results demonstrate that each downstream task has a different optimal morphological analyzer, and that it is better to use Byte-Pair-Encoding or Unigram rather than WordPiece as a subword tokenizer, regardless of the type of task.", + "authors": [ + "Takuro Fujii", + "Koki Shibata", + "Atsuki Yamaguchi", + "Terufumi Morishita", + "Yasuhiro Sogawa" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.5", + "point2d": [ + -40.02707290649414, + -40.287391662597656 + ], + "cluster": 46.0 + }, + { + "idx": 1138, + "title": "Semantic-Aware Dynamic Retrospective-Prospective Reasoning for Event-Level Video Question Answering", + "abstract": "Event-Level Video Question Answering (EVQA) requires complex reasoning across video events to obtain the visual information needed to provide optimal answers. However, despite significant progress in model performance, few studies have focused on using the explicit semantic connections between the question and visual information, especially at the event level. There is a need to use such semantic connections to facilitate complex reasoning across video frames. Therefore, we propose a semantic-aware dynamic retrospective-prospective reasoning approach for video-based question answering. Specifically, we explicitly use the Semantic Role Labeling (SRL) structure of the question in the dynamic reasoning process, where we decide to move to the next frame based on which part of the SRL structure (agent, verb, patient, etc.) of the question is being focused on. We conduct experiments on a benchmark EVQA dataset - TrafficQA. Results show that our proposed approach achieves superior performance compared to previous state-of-the-art models. Our code is publicly available at https://github.com/lyuchenyang/Semantic-aware-VideoQA.", + "authors": [ + "Chenyang Lyu", + "Tianbo Ji", + "Yvette Graham", + "Jennifer Foster" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.7", + "point2d": [ + 56.74937057495117, + 2.0307939052581787 + ], + "cluster": 22.0 + }, + { + "idx": 1139, + "title": "Jamp: Controlled Japanese Temporal Inference Dataset for Evaluating Generalization Capacity of Language Models", + "abstract": "Natural Language Inference (NLI) tasks involving temporal inference remain challenging for pre-trained language models (LMs). Although various datasets have been created for this task, they primarily focus on English and do not address the need for resources in other languages. It is unclear whether current LMs realize the generalization capacity for temporal inference across languages. In this paper, we present Jamp, a Japanese NLI benchmark focused on temporal inference. Our dataset includes a range of temporal inference patterns, which enables us to conduct fine-grained analysis. To begin the data annotation process, we create diverse inference templates based on the formal semantics test suites. We then automatically generate diverse NLI examples by using the Japanese case frame dictionary and well-designed templates while controlling the distribution of inference patterns and gold labels. We evaluate the generalization capacities of monolingual/multilingual LMs by splitting our dataset based on tense fragments (i.e., temporal inference patterns).
Our findings demonstrate that LMs struggle with specific linguistic phenomena, such as habituality, indicating that there is potential for the development of more effective NLI models across languages.", + "authors": [ + "Tomoki Sugimoto", + "Yasumasa Onoe", + "Hitomi Yanaka" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.8", + "point2d": [ + -31.93018341064453, + -40.46604537963867 + ], + "cluster": 46.0 + }, + { + "idx": 1140, + "title": "Constructing Multilingual Code Search Dataset Using Neural Machine Translation", + "abstract": "Code search is the task of finding program code that semantically matches the given natural language queries. Even though some of the existing datasets for this task are multilingual on the programming language side, their query data are only in English. In this research, we create a multilingual code search dataset in four natural and four programming languages using a neural machine translation model. Using our dataset, we pre-train and fine-tune the Transformer-based models and then evaluate them on multiple code search test sets. Our results show that the model pre-trained with all natural and programming language data performed best in most cases. By applying back-translation data filtering to our dataset, we demonstrate that the translation quality affects the model\u2019s performance to a certain extent, but the data size matters more.", + "authors": [ + "Ryo Sekizawa", + "Nan Duan", + "Shuai Lu", + "Hitomi Yanaka" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.10", + "point2d": [ + -11.0078706741333, + -52.5370979309082 + ], + "cluster": 11.0 + }, + { + "idx": 1141, + "title": "Multimodal Neural Machine Translation Using Synthetic Images Transformed by Latent Diffusion Model", + "abstract": "This study proposes a new multimodal neural machine translation (MNMT) model using synthetic images transformed by a latent diffusion model. MNMT translates a source language sentence based on its related image, but the image usually contains noisy information that is not relevant to the source language sentence. Our proposed method first generates a synthetic image corresponding to the content of the source language sentence by using a latent diffusion model and then performs translation based on the synthetic image. The experiments on the English-German translation tasks using the Multi30k dataset demonstrate the effectiveness of the proposed method.", + "authors": [ + "Ryoya Yuasa", + "Akihiro Tamura", + "Tomoyuki Kajiwara", + "Takashi Ninomiya", + "Tsuneo Kato" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.12", + "point2d": [ + -69.54627990722656, + 40.921756744384766 + ], + "cluster": 21.0 + }, + { + "idx": 1142, + "title": "Enhancing Ancient Chinese Understanding with Derived Noisy Syntax Trees", + "abstract": "Despite the rapid development of neural-based models, syntax still plays a crucial role in modern natural language processing. However, few studies have incorporated syntactic information into ancient Chinese understanding tasks due to the lack of syntactic annotation. This paper explores the role of syntax in ancient Chinese understanding based on the noisy syntax trees from unsupervised derivation and modern Chinese syntax parsers.
On top of that, we propose a novel syntax encoding component \u2013 the confidence-based syntax encoding network (cSEN) \u2013 to alleviate the side effects of the noise caused by unsupervised syntax derivation and the incompatibility between ancient and modern Chinese. Experiments on two typical ancient Chinese understanding tasks, ancient poetry theme classification and ancient-modern Chinese translation, demonstrate that syntactic information can effectively enhance the understanding of ancient Chinese over strong baselines, and that the proposed cSEN plays an important role in noisy scenarios.", + "authors": [ + "Ping Wang", + "Shitou Zhang", + "Zuchao Li", + "Jingrui Hou" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.15", + "point2d": [ + -42.525386810302734, + -46.93519973754883 + ], + "cluster": 46.0 + }, + { + "idx": 1143, + "title": "The Turing Quest: Can Transformers Make Good NPCs?", + "abstract": "In this paper, we study the viability of deploying language models for non-playable character (NPC) scripts by introducing a novel pipeline for the automatic construction of believable, Transformer-based NPC scripts for a variety of game genres and specifications. In addition, we propose a self-diagnosis method, inspired by previous work on developing language models, tailored specifically to desirable NPC qualities such as coherency, believability, and degree of repetition. Finally, we propose a new benchmark, called The Turing Quest, which we use to show that the pipeline, when applied to GPT-3, can generate, for a variety of game genres and contexts, NPC scripts that can fool judges into thinking they have been written by humans. We believe that these findings can greatly benefit both the gaming industry and its global community of users, since many current games continue to base their NPCs on manually-curated scripts that are resource-demanding and may curb the immersiveness and enjoyment of the user.", + "authors": [ + "Qi Chen Gao", + "Ali Emami" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.17", + "point2d": [ + 30.99973487854004, + 54.127559661865234 + ], + "cluster": 35.0 + }, + { + "idx": 1144, + "title": "Making the Most Out of the Limited Context Length: Predictive Power Varies with Clinical Note Type and Note Section", + "abstract": "Recent advances in large language models have led to renewed interest in natural language processing in healthcare using the free text of clinical notes. One distinguishing characteristic of clinical notes is their long time span over multiple long documents. The unique structure of clinical notes creates a new design choice: when the context length for a language model predictor is limited, which part of clinical notes should we choose as the input? Existing studies either choose the inputs with domain knowledge or simply truncate them. We propose a framework to analyze the sections with high predictive power. Using MIMIC-III, we show that: 1) the predictive power distribution differs between nursing notes and discharge notes, and 2) combining different types of notes could improve performance when the context length is large.
Our findings suggest that a carefully selected sampling function could enable more efficient information extraction from clinical notes.", + "authors": [ + "Hongyi Zheng", + "Yixin Zhu", + "Lavender Jiang", + "Kyunghyun Cho", + "Eric Oermann" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.18", + "point2d": [ + 29.811372756958008, + -38.88402557373047 + ], + "cluster": 42.0 + }, + { + "idx": 1145, + "title": "Intriguing Effect of the Correlation Prior on ICD-9 Code Assignment", + "abstract": "The Ninth Revision of the International Classification of Diseases (ICD-9) is a standardized coding system used to classify health conditions. It is used for billing, tracking individual patient conditions, and for epidemiology. The highly detailed and technical nature of the codes and their associated medical conditions makes it difficult for humans to accurately record them. Researchers have explored the use of neural networks, particularly language models, for automated ICD-9 code assignment. However, the imbalanced distribution of ICD-9 codes leads to poor performance. One solution is to use domain knowledge to incorporate a useful prior. This paper evaluates the usefulness of the correlation bias: we hypothesize that correlations between ICD-9 codes and other medical codes could help improve language models\u2019 performance. We showed that while the correlation bias worsens the overall performance, the effect on individual classes can be negative or positive. Performance on classes that are more imbalanced and less correlated with other codes is more sensitive to incorporating the correlation bias. This suggests that while the correlation bias has the potential to improve ICD-9 code assignment in certain cases, the applicability criteria need to be more carefully studied.", + "authors": [ + "Zihao Yang", + "Chenkang Zhang", + "Muru Wu", + "Xujin Liu", + "Lavender Jiang", + "Kyunghyun Cho", + "Eric Oermann" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.19", + "point2d": [ + 28.944791793823242, + -40.7750244140625 + ], + "cluster": 42.0 + }, + { + "idx": 1146, + "title": "Classical Out-of-Distribution Detection Methods Benchmark in Text Classification Tasks", + "abstract": "State-of-the-art models can perform well in controlled environments, but they often struggle when presented with out-of-distribution (OOD) examples, making OOD detection a critical component of NLP systems. In this paper, we focus on highlighting the limitations of existing approaches to OOD detection in NLP. Specifically, we evaluated eight OOD detection methods that are easily integrable into existing NLP systems and require no additional OOD data or model modifications. One of our contributions is providing a well-structured research environment that allows for full reproducibility of the results. Additionally, our analysis shows that existing OOD detection methods for NLP tasks are not yet sufficiently sensitive to capture all samples characterized by various types of distributional shifts. Particularly challenging testing scenarios arise in cases of background shift and randomly shuffled word order within in-domain texts.
This highlights the need for future work to develop more effective OOD detection approaches for NLP problems, and our work provides a well-defined foundation for further research in this area.", + "authors": [ + "Mateusz Baran", + "Joanna Baran", + "Mateusz W\u00f3jcik", + "Maciej Zi\u0119ba", + "Adam Gonczarek" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.20", + "point2d": [ + -1.002434253692627, + -6.72821044921875 + ], + "cluster": 17.0 + }, + { + "idx": 1147, + "title": "Can LMs Store and Retrieve 1-to-N Relational Knowledge?", + "abstract": "It has been suggested that pretrained language models can be viewed as knowledge bases. One of the prerequisites for using language models as knowledge bases is how accurately they can store and retrieve world knowledge. It has already been revealed that language models can store a large amount of 1-to-1 relational knowledge, such as \u201ccountry and its capital,\u201d with high memorization accuracy. On the other hand, world knowledge includes not only 1-to-1 but also 1-to-N relational knowledge, such as \u201cparent and children.\u201d However, it is not clear how accurately language models can handle 1-to-N relational knowledge. To investigate language models\u2019 abilities toward 1-to-N relational knowledge, we start by designing the problem settings. Specifically, we organize the characteristics of 1-to-N relational knowledge and define two essential skills: (i) memorizing multiple objects individually and (ii) retrieving multiple stored objects without excesses or deficiencies at once. We inspect LMs\u2019 ability to handle 1-to-N relational knowledge on the controlled synthesized data. As a result, we report that it is possible to memorize multiple objects with high accuracy, but generalizing the retrieval ability (specifically, enumeration) is challenging.", + "authors": [ + "Haruki Nagasawa", + "Benjamin Heinzerling", + "Kazuma Kokuta", + "Kentaro Inui" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.22", + "point2d": [ + 44.4998779296875, + -1.4610552787780762 + ], + "cluster": 36.0 + }, + { + "idx": 1148, + "title": "Theoretical Linguistics Rivals Embeddings in Language Clustering for Multilingual Named Entity Recognition", + "abstract": "While embedding-based methods have been dominant in language clustering for multilingual tasks, clustering based on linguistic features has not yet been explored much, as it has remained a baseline (Tan et al., 2019; Shaffer, 2021). This study investigates whether and how theoretical linguistics improves language clustering for multilingual named entity recognition (NER). We propose two types of language groupings: one based on morpho-syntactic features in a nominal domain and one based on a head parameter.
Our NER experiments show that the proposed methods largely outperform a state-of-the-art embedding-based model, suggesting that theoretical linguistics plays a significant role in multilingual learning tasks.", + "authors": [ + "Sakura Imai", + "Daisuke Kawahara", + "Naho Orita", + "Hiromune Oda" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.24", + "point2d": [ + 27.6635799407959, + -85.51258087158203 + ], + "cluster": 14.0 + }, + { + "idx": 1149, + "title": "Native Language Prediction from Gaze: a Reproducibility Study", + "abstract": "Numerous studies have found that the linguistic properties of a person\u2019s native language affect the cognitive processing of other languages. However, only one study has shown that it was possible to identify the native language based on eye-tracking records of natural L2 reading using machine learning. A new corpus allows us to replicate these results on a more interrelated and larger set of native languages. Our results show that comparable classification performance is maintained despite using less data. However, analysis shows that the correlation between L2 eye movements and native language similarity may be more complex than the original study found.", + "authors": [ + "Lina Skerath", + "Paulina Toborek", + "Anita Zieli\u0144ska", + "Maria Barrett", + "Rob Van Der Goot" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.26", + "point2d": [ + -44.899600982666016, + -9.357335090637207 + ], + "cluster": 46.0 + }, + { + "idx": 1150, + "title": "MedTem2.0: Prompt-based Temporal Classification of Treatment Events from Discharge Summaries", + "abstract": "Discharge summaries are comprehensive medical records that encompass vital information about a patient\u2019s hospital stay. A crucial aspect of discharge summaries is the temporal information of treatments administered throughout the patient\u2019s illness. With an extensive volume of clinical documents, manually extracting and compiling a patient\u2019s medication list can be laborious, time-consuming, and susceptible to errors. The objective of this paper is to build upon recent developments in clinical NLP by temporally classifying treatments in clinical texts, specifically determining whether a treatment was administered between the time of admission and discharge from the hospital. State-of-the-art NLP methods, including prompt-based learning on Generative Pre-trained Transformer (GPT) models and fine-tuning on pre-trained language models (PLMs) such as BERT, were employed to classify temporal relations between treatments and hospitalisation periods in discharge summaries. Fine-tuning with the BERT model achieved an F1 score of 92.45% and a balanced accuracy of 77.56%, while prompt learning using the T5 model and mixed templates resulted in an F1 score of 90.89% and a balanced accuracy of 72.07%. Our code and data are available at https://github.com/HECTA-UoM/MedTem.", + "authors": [ + "Yang Cui", + "Lifeng Han", + "Goran Nenadic" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.27", + "point2d": [ + 30.065776824951172, + -38.73167419433594 + ], + "cluster": 42.0 + }, + { + "idx": 1151, + "title": "Sudden Semantic Shifts in Swedish NATO discourse", + "abstract": "In this paper, we investigate a type of semantic shift that occurs when a sudden event radically changes public opinion on a topic.
Looking at Sweden\u2019s decision to apply for NATO membership in 2022, we use word embeddings to study how the associations that users on Twitter have regarding NATO evolve. We identify several changes that we successfully validate against real-world events. However, the low engagement of the public with the issue often made it challenging to distinguish true signals from noise. We thus find that domain knowledge and data selection are of prime importance when using word embeddings to study semantic shifts.", + "authors": [ + "Brian Bonafilia", + "Bastiaan Bruinsma", + "Denitsa Saynova", + "Moa Johansson" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.28", + "point2d": [ + 33.59685516357422, + 33.82600784301758 + ], + "cluster": 19.0 + }, + { + "idx": 1152, + "title": "Building a Buzzer-quiz Answering System", + "abstract": "A buzzer quiz is a genre of quiz in which multiple players simultaneously listen to a quiz being read aloud and respond to it by buzzing in as soon as they can predict the answer. Because incorrect answers often result in penalties, a buzzer-quiz answering system must not only predict the answer from only part of a question but also estimate the predicted answer\u2019s accuracy. In this paper, we introduce two types of buzzer-quiz answering systems: (1) a system that directly generates an answer from part of a question by using an autoregressive language model; and (2) a system that first reconstructs the entire question by using an autoregressive language model and then determines the answer according to the reconstructed question. We then propose a method to estimate the accuracy of the answers for each system by using the internal scores of each model.", + "authors": [ + "Naoya Sugiura", + "Kosuke Yamada", + "Ryohei Sasano", + "Koichi Takeda", + "Katsuhiko Toyama" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.29", + "point2d": [ + 61.996944427490234, + 16.151832580566406 + ], + "cluster": 5.0 + }, + { + "idx": 1153, + "title": "Probing for Hyperbole in Pre-Trained Language Models", + "abstract": "Hyperbole is a common figure of speech, which is under-explored in NLP research. In this study, we conduct edge and minimal description length (MDL) probing experiments on three pre-trained language models (PLMs) in an attempt to explore the extent to which hyperbolic information is encoded in these models. We use both word-in-context and sentence-level representations as model inputs as a basis for comparison. We also annotate 63 hyperbole sentences from the HYPO dataset according to an operational taxonomy to conduct an error analysis to explore the encoding of different hyperbole categories. Our results show that hyperbole is to a limited extent encoded in PLMs, and mostly in the final layers. They also indicate that hyperbolic information may be better encoded by the sentence-level representations, which, due to the pragmatic nature of hyperbole, may therefore provide a more accurate and informative representation in PLMs.
Finally, the inter-annotator agreement for our annotations, a Cohen\u2019s Kappa of 0.339, suggests that the taxonomy categories may not be intuitive and need revision or simplification.", + "authors": [ + "Nina Schneidermann", + "Daniel Hershcovich", + "Bolette Pedersen" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.30", + "point2d": [ + -30.91903305053711, + -34.57668685913086 + ], + "cluster": 6.0 + }, + { + "idx": 1154, + "title": "Towards Efficient Dialogue Processing in the Emergency Response Domain", + "abstract": "In this paper, we describe the task of adapting NLP models to dialogue processing in the emergency response domain. Our goal is to provide a recipe for building a system that performs dialogue act classification and domain-specific slot tagging while being efficient, flexible and robust. We show that adapter models (Pfeiffer et al., 2020) perform well in the emergency response domain and benefit from additional dialogue context and speaker information. Comparing adapters to standard fine-tuned Transformer models, we show that they achieve competitive results and can easily accommodate new tasks without significant memory increase, since the base model can be shared between the adapters specializing in different tasks. We also address the problem of scarce annotations in the emergency response domain and evaluate different data augmentation techniques in a low-resource setting.", + "authors": [ + "Tatiana Anikina" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.31", + "point2d": [ + 5.580127239227295, + 68.99396514892578 + ], + "cluster": 24.0 + }, + { + "idx": 1155, + "title": "I already said that! Degenerating redundant questions in open-domain dialogue systems.", + "abstract": "Neural text generation models have achieved remarkable success in carrying on short open-domain conversations. However, their performance degrades significantly in the long term, especially in their ability to ask coherent questions. A significant issue is the generation of redundant questions where the answer has already been provided by the user. We adapt and evaluate different methods, including negative training, decoding, and classification, to mitigate the redundancy problem. We also propose a simple yet effective method for generating training data without the need for crowdsourcing human-human or human-bot conversations. Experiments with the BlenderBot model show that our combined method significantly reduces the rate of redundant questions from 27.2% to 8.7%, while improving the quality of the original model. The code, dataset, and trained models can be found at our repository.", + "authors": [ + "Long Mai", + "Julie Carson-berndsen" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.33", + "point2d": [ + 6.857015609741211, + 56.14161682128906 + ], + "cluster": 49.0 + }, + { + "idx": 1156, + "title": "Is a Knowledge-based Response Engaging?: An Analysis on Knowledge-Grounded Dialogue with Information Source Annotation", + "abstract": "Currently, most knowledge-grounded dialogue response generation models focus on reflecting given external knowledge. However, even when conveying external knowledge, humans integrate their own knowledge, experiences, and opinions with external knowledge to make their utterances engaging.
In this study, we analyze such human behavior by annotating the utterances in an existing knowledge-grounded dialogue corpus. Each entity in the corpus is annotated with its information source, either derived from external knowledge (database-derived) or the speaker\u2019s own knowledge, experiences, and opinions (speaker-derived). Our analysis shows that the presence of speaker-derived information in the utterance improves dialogue engagingness. We also confirm that responses generated by an existing model, which is trained to reflect the given knowledge, cannot include speaker-derived information as often as humans do.", + "authors": [ + "Takashi Kodama", + "Hirokazu Kiyomaru", + "Yin Jou Huang", + "Taro Okahisa", + "Sadao Kurohashi" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.34", + "point2d": [ + 14.079499244689941, + 56.22706604003906 + ], + "cluster": 24.0 + }, + { + "idx": 1157, + "title": "Choosing What to Mask: More Informed Masking for Multimodal Machine Translation", + "abstract": "Pre-trained language models have achieved remarkable results on several NLP tasks. Most of them adopt masked language modeling to learn representations by randomly masking tokens and predicting them based on their context. However, this random selection of tokens to be masked is inefficient for learning some language patterns, as it may not consider linguistic information that can be helpful for many NLP tasks, such as multimodal machine translation (MMT). Hence, we propose three novel masking strategies for cross-lingual visual pre-training - more informed visual masking, more informed textual masking, and more informed visual and textual masking - each one focusing on learning different linguistic patterns. We apply them to Vision Translation Language Modelling for video subtitles (Sato et al., 2022) and conduct extensive experiments on the Portuguese-English MMT task. The results show that our masking approaches yield significant improvements over the original random masking strategy for downstream MMT performance. Our models outperform the MMT baseline, and we achieve state-of-the-art accuracy (52.70 in terms of BLEU score) on the How2 dataset, indicating that more informed masking helps in acquiring an understanding of specific language structures and has great potential for language understanding.", + "authors": [ + "Julia Sato", + "Helena Caseli", + "Lucia Specia" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.35", + "point2d": [ + -68.02626037597656, + 38.22868728637695 + ], + "cluster": 21.0 + }, + { + "idx": 1158, + "title": "Combining Tradition with Modernness: Exploring Event Representations in Vision-and-Language Models for Visual Goal-Step Inference", + "abstract": "Procedural knowledge understanding (PKU) underlies the ability to infer goal-step relations. The task of Visual Goal\u2013Step Inference addresses this ability in the multimodal domain. It requires identifying images that represent the steps towards achieving a textually expressed goal. The best existing methods encode texts and images either with independent encoders, or with object-level multimodal encoders using black-box transformers. This stands in contrast to early, linguistically inspired methods for event representations, which focus on capturing the most crucial information, namely actions and the participants, to learn stereotypical event sequences and hence procedural knowledge.
In this work, we study various methods of injecting the early, shallow event representations into modern multimodal deep learning-based models, and their effects on PKU. We find that the early, linguistically inspired methods for representing event knowledge do contribute to understanding procedures in combination with modern vision-and-language models. In the future, we are going to explore more complex structures of events and study how to exploit them on top of large language models.", + "authors": [ + "Chong Shen", + "Carina Silberer" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.36", + "point2d": [ + 55.59572219848633, + -8.573057174682617 + ], + "cluster": 43.0 + }, + { + "idx": 1159, + "title": "Data Selection for Fine-tuning Large Language Models Using Transferred Shapley Values", + "abstract": "Although Shapley values have been shown to be highly effective for identifying harmful training instances, dataset size and model complexity constraints limit the ability to apply Shapley-based data valuation to fine-tuning large pre-trained language models. To address this, we propose TS-DShapley, an algorithm that reduces the computational cost of Shapley-based data valuation through: 1) an efficient sampling-based method that aggregates Shapley values computed from subsets for valuation of the entire training set, and 2) a value transfer method that leverages value information extracted from a simple classifier trained using representations from the target language model. Our experiments applying TS-DShapley to select data for fine-tuning BERT-based language models on benchmark natural language understanding (NLU) datasets show that TS-DShapley outperforms existing data selection methods. Further, TS-DShapley can filter fine-tuning data to increase language model performance compared to training with the full fine-tuning dataset.", + "authors": [ + "Stephanie Schoch", + "Ritwick Mishra", + "Yangfeng Ji" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.37", + "point2d": [ + -9.156288146972656, + -0.9793235063552856 + ], + "cluster": 39.0 + }, + { + "idx": 1160, + "title": "Distractor Generation for Fill-in-the-Blank Exercises by Question Type", + "abstract": "This study addresses the automatic generation of distractors for English fill-in-the-blank exercises in the entrance examinations for Japanese universities. While previous studies applied the same method to all questions, actual entrance examinations have multiple question types that reflect the purpose of the questions. Therefore, we define three types of questions (grammar, function word, and context) and propose a method to generate distractors according to the characteristics of each question type.
Experimental results on 500 actual questions show the effectiveness of the proposed method in both automatic and manual evaluation.", + "authors": [ + "Nana Yoshimi", + "Tomoyuki Kajiwara", + "Satoru Uchida", + "Yuki Arase", + "Takashi Ninomiya" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.38", + "point2d": [ + 71.1431884765625, + 12.241619110107422 + ], + "cluster": 5.0 + }, + { + "idx": 1161, + "title": "Moral Mimicry: Large Language Models Produce Moral Rationalizations Tailored to Political Identity", + "abstract": "Large Language Models (LLMs) have demonstrated impressive capabilities in generating fluent text, as well as tendencies to reproduce undesirable social biases. This work investigates whether LLMs reproduce the moral biases associated with political groups in the United States, an instance of a broader capability herein termed moral mimicry. This work explores this hypothesis in the GPT-3/3.5 and OPT families of Transformer-based LLMs. Using tools from Moral Foundations Theory, this work shows that these LLMs are indeed moral mimics. When prompted with a liberal or conservative political identity, the models generate text reflecting corresponding moral biases. This study also explores the relationship between moral mimicry and model size, and the similarity between human and LLM moral word use.", + "authors": [ + "Gabriel Simmons" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.40", + "point2d": [ + 26.834238052368164, + 31.34027671813965 + ], + "cluster": 10.0 + }, + { + "idx": 1162, + "title": "LECO: Improving Early Exiting via Learned Exits and Comparison-based Exiting Mechanism", + "abstract": "Recently, dynamic early exiting has attracted much attention since it can accelerate the inference speed of pre-trained models (PTMs). However, previous work on early exiting has neglected the intermediate exits\u2019 architectural designs. In this work, we propose a novel framework, Learned Exits and COmparison-based early exiting (LECO), to improve PTMs\u2019 early exiting performance. First, to fully uncover the potential of multi-exit BERT, we design a novel search space for intermediate exits and employ the idea of differentiable neural architecture search (DNAS) to design proper exit architectures for different intermediate layers automatically. Second, we propose a simple-yet-effective comparison-based early exiting mechanism (COBEE), which can help PTMs achieve better tradeoffs between performance and speedup. Extensive experiments show that our LECO achieves SOTA performance for multi-exit BERT training and dynamic early exiting.", + "authors": [ + "Jingfan Zhang", + "Ming Tan", + "Pengyu Dai", + "Wei Zhu" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.43", + "point2d": [ + -41.628536224365234, + -23.558216094970703 + ], + "cluster": 44.0 + }, + { + "idx": 1163, + "title": "Authorship Attribution of Late 19th Century Novels using GAN-BERT", + "abstract": "Authorship attribution aims to identify the author of an anonymous text. The task becomes even more worthwhile when it comes to literary works. For example, pen names were commonly used by female authors in the 19th century, resulting in some literary works being incorrectly attributed or claimed. With this motivation, we collated a dataset of late 19th century novels in English.
Due to the imbalance in the dataset and the unavailability of enough data per author, we employed the GAN-BERT model along with data sampling strategies to fine-tune a transformer-based model for authorship attribution. Unlike earlier studies on the GAN-BERT model, we conducted transfer learning on comparatively smaller author subsets to train more focused author-specific models, yielding accuracy and F1 scores above 0.88. Furthermore, we observed that increasing the sample size has a negative impact on the model\u2019s performance. Our research mainly contributes to the ongoing authorship attribution research using the GAN-BERT architecture, especially in attributing disputed novelists in the late 19th century.", + "authors": [ + "Kanishka Silva", + "Burcu Can", + "Fr\u00e9d\u00e9ric Blain", + "Raheem Sarwar", + "Laura Ugolini", + "Ruslan Mitkov" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.44", + "point2d": [ + -5.3005571365356445, + 24.778121948242188 + ], + "cluster": 10.0 + }, + { + "idx": 1164, + "title": "How-to Guides for Specific Audiences: A Corpus and Initial Findings", + "abstract": "Instructional texts for specific target groups should ideally take into account the prior knowledge and needs of the readers in order to guide them efficiently to their desired goals. However, targeting specific groups also carries the risk of reflecting disparate social norms and subtle stereotypes. In this paper, we investigate the extent to which how-to guides from one particular platform, wikiHow, differ in practice depending on the intended audience. We conduct two case studies in which we examine qualitative features of texts written for specific audiences. In a generalization study, we investigate which differences can also be systematically demonstrated using computational methods. The results of our studies show that guides from wikiHow, like other text genres, are subject to subtle biases. We aim to raise awareness of these inequalities as a first step to addressing them in future work.", + "authors": [ + "Nicola Fanton", + "Agnieszka Falenska", + "Michael Roth" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.46", + "point2d": [ + 25.049884796142578, + 19.002775192260742 + ], + "cluster": 10.0 + }, + { + "idx": 1165, + "title": "\u201cWhen Words Fail, Emojis Prevail\u201d: A Novel Architecture for Generating Sarcastic Sentences With Emoji Using Valence Reversal and Semantic Incongruity", + "abstract": "Sarcasm is a form of figurative language that serves as a humorous tool for mockery and ridicule. We present a novel architecture for sarcasm generation with emoji from a non-sarcastic input sentence in English. We divide the generation task into two subtasks: one for generating textual sarcasm and another for collecting emojis associated with those sarcastic sentences. Two key elements of sarcasm are incorporated into the textual sarcasm generation task: valence reversal and semantic incongruity with context, where the context may involve shared commonsense or general knowledge between the speaker and their audience. The majority of existing sarcasm generation works have focused on this textual form. However, in the real world, when written texts fall short of effectively capturing the emotional cues of spoken and face-to-face communication, people often opt for emojis to accurately express their emotions.
Due to the wide range of applications of emojis, incorporating appropriate emojis to generate textual sarcastic sentences helps advance sarcasm generation. We conclude our study by evaluating the generated sarcastic sentences using human judgement. All the code and data used in this study have been made publicly available.", + "authors": [ + "Faria Binte Kader", + "Nafisa Hossain Nujat", + "Tasmia Binte Sogir", + "Mohsinul Kabir", + "Hasan Mahmud", + "Md Kamrul Hasan" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.47", + "point2d": [ + -23.595317840576172, + 61.66615676879883 + ], + "cluster": 34.0 + }, + { + "idx": 1166, + "title": "Semantic Accuracy in Natural Language Generation: A Thesis Proposal", + "abstract": "With the fast-growing popularity of current large pre-trained language models (LLMs), it is necessary to dedicate efforts to making them more reliable. In this thesis proposal, we aim to improve the reliability of natural language generation systems (NLG) by researching the semantic accuracy of their outputs. We look at this problem from the outside (evaluation) and from the inside (interpretability). We propose a novel method for evaluating semantic accuracy and discuss the importance of working towards a unified and objective benchmark for NLG metrics. We also review interpretability approaches which could help us pinpoint the sources of inaccuracies within the models and explore potential mitigation strategies.", + "authors": [ + "Patricia Schmidtova" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.48", + "point2d": [ + -21.35477066040039, + 14.670857429504395 + ], + "cluster": 4.0 + }, + { + "idx": 1167, + "title": "Math Word Problem Solving by Generating Linguistic Variants of Problem Statements", + "abstract": "The art of mathematical reasoning stands as a fundamental pillar of intellectual progress and is a central catalyst in cultivating human ingenuity. Researchers have recently published a plethora of works centered around the task of solving Math Word Problems (MWP) \u2014 a crucial stride towards general AI. These existing models are susceptible to dependency on shallow heuristics and spurious correlations to derive the solution expressions. In order to ameliorate this issue, in this paper, we propose a framework for MWP solvers based on the generation of linguistic variants of the problem text. The approach involves solving each of the variant problems and electing the predicted expression with the majority of the votes. We use DeBERTa (Decoding-enhanced BERT with disentangled attention) as the encoder to leverage its rich textual representations and enhanced mask decoder to construct the solution expressions. Furthermore, we introduce a challenging dataset, ParaMAWPS, consisting of paraphrased, adversarial, and inverse variants of selectively sampled MWPs from the benchmark Mawps dataset. We extensively experiment on this dataset along with other benchmark datasets using some baseline MWP solver models. We show that training on linguistic variants of problem statements and voting on candidate predictions improve the mathematical reasoning and robustness of the model. We make our code and data publicly available.", + "authors": [ + "Syed Rifat Raiyan", + "Md Nafis Faiyaz", + "Shah Md. 
Jawad Kabir", + "Mohsinul Kabir", + "Hasan Mahmud", + "Md Kamrul Hasan" + ], + "year": 2023, + "source": "acl", + "publication_type": "srw", + "doi": "10.18653/v1/2023.acl-srw.49", + "point2d": [ + 46.009613037109375, + -21.459619522094727 + ], + "cluster": 12.0 + }, + { + "idx": 1168, + "title": "CWSeg: An Efficient and General Approach to Chinese Word Segmentation", + "abstract": "In this work, we report our efforts in advancing Chinese Word Segmentation for the purpose of rapid deployment in different applications. The pre-trained language model (PLM) based segmentation methods have achieved state-of-the-art (SOTA) performance, whereas this paradigm also poses challenges in deployment. These include the balance between performance and cost, segmentation ambiguity due to domain diversity and vague word boundaries, and multi-grained segmentation. In this context, we propose a simple yet effective approach, namely CWSeg, to augment PLM-based schemes by developing cohort training and versatile decoding strategies. Extensive experiments on benchmark datasets demonstrate the efficiency and generalization of our approach. The corresponding segmentation system is also implemented for practical usage and the demo is recorded.", + "authors": [ + "Dedong Li", + "Rui Zhao", + "Fei Tan" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.1", + "point2d": [ + -46.2789192199707, + 7.6127729415893555 + ], + "cluster": 30.0 + }, + { + "idx": 1169, + "title": "\u201cKnowledge is Power\u201d: Constructing Knowledge Graph of Abdominal Organs and Using Them for Automatic Radiology Report Generation", + "abstract": "In conventional radiology practice, the radiologist dictates the diagnosis to the transcriptionist, who then prepares a preliminary formatted report referring to the notes, after which the radiologist reviews the report, corrects the errors, and signs off. This workflow is prone to delay and error. In this paper, we report our work on automatic radiology report generation from radiologists\u2019 dictation, which is in collaboration with a startup about to become a Unicorn. A major contribution of our work is the set of knowledge graphs (KGs) of ten abdominal organs: Liver, Kidney, Gallbladder, Uterus, Urinary bladder, Ovary, Pancreas, Prostate, Biliary Tree, and Bowel. Our method for constructing these KGs relies on extracting entity1-relation-entity2 triplets from a large collection (about 10,000) of free-text radiology reports. The quality and coverage of the KGs are verified by two experienced radiologists (practicing for the last 30 years and 8 years, respectively). The dictation of the radiologist is automatically converted to what is called a pathological description which is the clinical description of the findings of the radiologist during ultrasonography (USG). Our knowledge-enhanced deep learning model improves the reported BLEU-3, ROUGE-L, METEOR, and CIDEr scores of the pathological description generation by 2%, 4%, 2%, and 2%, respectively. To the best of our knowledge, this is the first attempt at representing the abdominal organs in the form of knowledge graphs and utilising these graphs for the automatic generation of USG reports. A Minimum Viable Product (MVP) has been made available to the beta users, i.e., radiologists of reputed hospitals, for testing and evaluation. 
Our solution guarantees report generation within 30 seconds of running a scan.", + "authors": [ + "Kaveri Kale", + "Pushpak Bhattacharyya", + "Aditya Shetty", + "Milind Gune", + "Kush Shrivastava", + "Rustom Lawyer", + "Spriha Biswas" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.2", + "point2d": [ + 33.06934356689453, + -34.001155853271484 + ], + "cluster": 42.0 + }, + { + "idx": 1170, + "title": "Hunt for Buried Treasures: Extracting Unclaimed Embodiments from Patent Specifications", + "abstract": "Patent applicants write patent specifications that describe embodiments of inventions. Some embodiments are claimed for a patent, while others may be unclaimed due to strategic considerations. Unclaimed embodiments may be extracted by applicants later and claimed in continuing applications to gain advantages over competitors. Despite being essential for corporate intellectual property (IP) strategies, unclaimed embodiment extraction is conducted manually, and little research has been conducted on its automation. This paper presents a novel task of unclaimed embodiment extraction (UEE) and a novel dataset for the task. Our experiments with Transformer-based models demonstrated that the task was challenging as it required conducting natural language inference on patent specifications, which consisted of technical, long, syntactically and semantically involved sentences. We release the dataset and code to foster this new area of research.", + "authors": [ + "Chikara Hashimoto", + "Gautam Kumar", + "Shuichiro Hashimoto", + "Jun Suzuki" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.3", + "point2d": [ + 25.296100616455078, + -2.497847318649292 + ], + "cluster": 31.0 + }, + { + "idx": 1171, + "title": "MathPrompter: Mathematical Reasoning using Large Language Models", + "abstract": "Large Language Models (LLMs) have limited performance when solving arithmetic reasoning tasks and often provide incorrect answers. Unlike natural language understanding, math problems typically have a single correct answer, making the task of generating accurate solutions more challenging for LLMs. We are not aware of any LLMs that indicate their level of confidence in their responses, which fuels a trust deficit in these models and impedes their adoption. To address this deficiency, we propose \u2018MathPrompter\u2019, a technique that improves performance of LLMs on arithmetic problems along with increased confidence in the predictions. MathPrompter uses the Zero-shot chain-of-thought prompting technique to generate multiple algebraic expressions or python functions to solve the same math problem in different ways and thereby raise the confidence level in the output results. This is in contrast to other prompt-based CoT methods, where there is no check on the validity of the intermediate steps followed. 
Our technique improves over the state-of-the-art on the \u2018MultiArith\u2019 dataset (78.7% to 92.5%), evaluated using a 175B-parameter GPT-based LLM.", + "authors": [ + "Shima Imani", + "Liang Du", + "Harsh Shrivastava" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.4", + "point2d": [ + 43.83344650268555, + -20.69260597229004 + ], + "cluster": 12.0 + }, + { + "idx": 1172, + "title": "Constrained Policy Optimization for Controlled Self-Learning in Conversational AI Systems", + "abstract": "Recently, self-learning methods based on user satisfaction metrics and contextual bandits have shown promising results to enable consistent improvements in conversational AI systems. However, directly targeting such metrics by off-policy bandit learning objectives often increases the risk of making abrupt policy changes that break the current user experience. In this study, we introduce a scalable framework for supporting fine-grained exploration targets for individual domains via user-defined constraints. For example, we may want to ensure fewer policy deviations in business-critical domains such as shopping, while allocating more exploration budget to domains such as music. We present a novel meta-gradient learning approach that is scalable and practical to address this problem. The proposed method adjusts constraint violation penalty terms adaptively through a meta objective that encourages balanced constraint satisfaction across domains. We conducted extensive experiments on a real-world conversational AI using a set of realistic constraint benchmarks. The proposed approach has been deployed in production for a large-scale commercial assistant, enabling the best balance between the policy value and constraint satisfaction rate.", + "authors": [ + "Mohammad Kachuee", + "Sungjin Lee" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.5", + "point2d": [ + 19.59737777709961, + 77.89226531982422 + ], + "cluster": 2.0 + }, + { + "idx": 1173, + "title": "pNLP-Mixer: an Efficient all-MLP Architecture for Language", + "abstract": "Large pre-trained language models based on transformer architecture have drastically changed the natural language processing (NLP) landscape. However, deploying those models for on-device applications in constrained devices such as smart watches is completely impractical due to their size and inference cost. As an alternative to transformer-based architectures, recent work on efficient NLP has shown that weight-efficient models can attain competitive performance for simple tasks, such as slot filling and intent classification, with model sizes in the order of the megabyte. This work introduces the pNLP-Mixer architecture, an embedding-free MLP-Mixer model for on-device NLP that achieves high weight-efficiency thanks to a novel projection layer. We evaluate a pNLP-Mixer model of only one megabyte in size on two multi-lingual semantic parsing datasets, MTOP and multiATIS. Our quantized model achieves 99.4% and 97.8% of the performance of mBERT on MTOP and multiATIS, while using 170x fewer parameters. 
Our model consistently beats the state-of-the-art of tiny models (pQRNN), which is twice as large, by a margin of up to 7.8% on MTOP.", + "authors": [ + "Francesco Fusco", + "Damian Pascual", + "Peter Staar", + "Diego Antognini" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.6", + "point2d": [ + -52.31547546386719, + 15.851584434509277 + ], + "cluster": 6.0 + }, + { + "idx": 1174, + "title": "Extracting Text Representations for Terms and Phrases in Technical Domains", + "abstract": "Extracting dense representations for terms and phrases is a task of great importance for knowledge discovery platforms targeting highly-technical fields. Dense representations are used as features for downstream components and have multiple applications ranging from ranking results in search to summarization. Common approaches to create dense representations include training domain-specific embeddings with self-supervised setups or using sentence encoder models trained over similarity tasks. In contrast to static embeddings, sentence encoders do not suffer from the out-of-vocabulary (OOV) problem, but impose significant computational costs. In this paper, we propose a fully unsupervised approach to text encoding that consists of training small character-based models with the objective of reconstructing large pre-trained embedding matrices. Models trained with this approach can not only match the quality of sentence encoders in technical domains, but are 5 times smaller and up to 10 times faster, even on high-end GPUs.", + "authors": [ + "Francesco Fusco", + "Diego Antognini" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.7", + "point2d": [ + -0.39339402318000793, + -35.2827262878418 + ], + "cluster": 20.0 + }, + { + "idx": 1175, + "title": "CocaCLIP: Exploring Distillation of Fully-Connected Knowledge Interaction Graph for Lightweight Text-Image Retrieval", + "abstract": "Large-scale pre-trained text-image models with dual-encoder architectures (such as CLIP) are typically adopted for various vision-language applications, including text-image retrieval. However, these models are still less practical on edge devices or for real-time situations, due to the substantial indexing and inference time and the large consumption of computational resources. Although knowledge distillation techniques have been widely utilized for uni-modal model compression, how to expand them to the situation when the numbers of modalities and teachers/students are doubled has been rarely studied. In this paper, we conduct comprehensive experiments on this topic and propose the fully-Connected knowledge interaction graph (Coca) technique for cross-modal pre-training distillation. Based on our findings, the resulting CocaCLIP achieves SOTA performances on the widely-used Flickr30K and MSCOCO benchmarks under the lightweight setting. 
An industry application of our method on an e-commerce platform further demonstrates the significant effectiveness of CocaCLIP.", + "authors": [ + "Jiapeng Wang", + "Chengyu Wang", + "Xiaodan Wang", + "Jun Huang", + "Lianwen Jin" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.8", + "point2d": [ + -57.93296432495117, + 37.8043098449707 + ], + "cluster": 26.0 + }, + { + "idx": 1176, + "title": "KG-FLIP: Knowledge-guided Fashion-domain Language-Image Pre-training for E-commerce", + "abstract": "Various Vision-Language Pre-training (VLP) models (e.g., CLIP, BLIP) have sprung up and dramatically advanced the benchmarks for public general-domain datasets (e.g., COCO, Flickr30k). Such models usually learn the cross-modal alignment from large-scale well-aligned image-text datasets without leveraging external knowledge. Adapting these models to downstream applications in specific domains like fashion requires fine-grained in-domain image-text corpora, which are usually less semantically aligned and of small scale, requiring efficient pre-training strategies. In this paper, we propose a knowledge-guided fashion-domain language-image pre-training (FLIP) framework that focuses on learning fine-grained representations in the e-commerce domain and utilizes external knowledge (i.e., product attribute schema), to improve the pre-training efficiency. Experiments demonstrate that FLIP outperforms previous state-of-the-art VLP models on Amazon data and on the Fashion-Gen dataset by large margins. FLIP has been successfully deployed in the Amazon catalog system to backfill missing attributes and improve the customer shopping experience.", + "authors": [ + "Qinjin Jia", + "Yang Liu", + "Daoping Wu", + "Shaoyuan Xu", + "Huidong Liu", + "Jinmiao Fu", + "Roland Vollgraf", + "Bryan Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.9", + "point2d": [ + -57.77860641479492, + 40.451683044433594 + ], + "cluster": 26.0 + }, + { + "idx": 1177, + "title": "Domain-specific transformer models for query translation", + "abstract": "Due to the democratization of e-commerce, many product companies are listing their goods for online shopping. For periodic buying within a domain such as Grocery, consumers are generally inclined to buy certain brands of products. Due to a large non-English speaking population in India, we observe a significant percentage of code-mix Hinglish search queries, e.g., sasta atta. An intuitive approach to dealing with code-mix queries is to train an encoder-decoder model to translate the query to English to perform the search. However, the problem becomes non-trivial when the brand names themselves have Hinglish names and possibly have a literal English translation. In such queries, only the context (non-brand name) Hinglish words need to be translated. In this paper, we propose a simple yet effective modification to the transformer training to preserve/correct Grocery brand names in the output while selectively translating the context words. To achieve this, we use an additional dataset of popular Grocery brand names. Brand names are added as tokens to the model vocabulary, and the token embeddings are randomly initialized. Further, we introduce a Brand loss in training the translation model. Brand loss is a cross-entropy loss computed using a denoising auto-encoder objective with brand name data. 
We warm-start the training from a public pre-trained checkpoint (such as BART/T5) and further adapt it for query translation using the domain data. The proposed model is generic and can be used with English as well as code-mix Hinglish queries, alleviating the need for language detection. To reduce the latency of the model for production deployment, we use knowledge distillation and quantization. Experimental evaluation indicates that the proposed approach improves translation results by preserving/correcting English/Hinglish brand names. After positive results with A/B testing, the model is currently deployed in production.", + "authors": [ + "Mandar Kulkarni", + "Nikesh Garera", + "Anusua Trivedi" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.10", + "point2d": [ + -65.70652770996094, + -17.072006225585938 + ], + "cluster": 21.0 + }, + { + "idx": 1178, + "title": "Label efficient semi-supervised conversational intent classification", + "abstract": "To provide a convenient shopping experience and to answer user queries at scale, conversational platforms are essential for e-commerce. The user queries can be pre-purchase questions, such as those related to product specifications and delivery time, or post-purchase queries, such as exchange and return. A chatbot should be able to understand and answer a variety of such queries to help users with relevant information. One of the important modules in the chatbot is automated intent identification, i.e., understanding the user\u2019s intention from the query text. Due to non-English speaking users interacting with the chatbot, we often get a significant percentage of code mix queries and queries with grammatical errors, which makes the problem more challenging. This paper proposes a simple yet competent Semi-Supervised Learning (SSL) approach for label-efficient intent classification. We use a small labeled corpus and relatively larger unlabeled query data to train a transformer model. For training the model with labeled data, we explore supervised MixUp data augmentation. To train with unlabeled data, we explore label consistency with dropout noise. We experiment with different pre-trained transformer architectures, such as BERT and sentence-BERT. Experimental results demonstrate that the proposed approach significantly improves over the supervised baseline, even with a limited labeled set. A variant of the model is currently deployed in production.", + "authors": [ + "Mandar Kulkarni", + "Kyung Kim", + "Nikesh Garera", + "Anusua Trivedi" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.11", + "point2d": [ + -8.877307891845703, + 73.18698120117188 + ], + "cluster": 32.0 + }, + { + "idx": 1179, + "title": "xPQA: Cross-Lingual Product Question Answering in 12 Languages", + "abstract": "Product Question Answering (PQA) systems are key in e-commerce applications as they provide responses to customers\u2019 questions as they shop for products. While existing work on PQA focuses mainly on English, in practice there is a need to support multiple customer languages while leveraging product information available in English. 
To study this practical industrial task, we present xPQA, a large-scale annotated cross-lingual PQA dataset in 12 languages, and report results in (1) candidate ranking, to select the best English candidate containing the information to answer a non-English question; and (2) answer generation, to generate a natural-sounding non-English answer based on the selected English candidate.We evaluate various approaches involving machine translation at runtime or offline, leveraging multilingual pre-trained LMs, and including or excluding xPQA training data. We find that in-domain data is essential as cross-lingual rankers trained on other domains perform poorly on the PQA task, and that translation-based approaches are most effective for candidate ranking while multilingual finetuning works best for answer generation. Still, there remains a significant performance gap between the English and the cross-lingual test sets.", + "authors": [ + "Xiaoyu Shen", + "Akari Asai", + "Bill Byrne", + "Adria De Gispert" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.12", + "point2d": [ + 66.63784790039062, + 18.845914840698242 + ], + "cluster": 5.0 + }, + { + "idx": 1180, + "title": "Learn over Past, Evolve for Future: Forecasting Temporal Trends for Fake News Detection", + "abstract": "Fake news detection has been a critical task for maintaining the health of the online news ecosystem. However, very few existing works consider the temporal shift issue caused by the rapidly-evolving nature of news data in practice, resulting in significant performance degradation when training on past data and testing on future data. In this paper, we observe that the appearances of news events on the same topic may display discernible patterns over time, and posit that such patterns can assist in selecting training instances that could make the model adapt better to future data. Specifically, we design an effective framework FTT (Forecasting Temporal Trends), which could forecast the temporal distribution patterns of news data and then guide the detector to fast adapt to future distribution. Experiments on the real-world temporally split dataset demonstrate the superiority of our proposed framework.", + "authors": [ + "Beizhe Hu", + "Qiang Sheng", + "Juan Cao", + "Yongchun Zhu", + "Danding Wang", + "Zhengjia Wang", + "Zhiwei Jin" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.13", + "point2d": [ + 35.91476821899414, + 19.69803237915039 + ], + "cluster": 19.0 + }, + { + "idx": 1181, + "title": "AVEN-GR: Attribute Value Extraction and Normalization using product GRaphs", + "abstract": "Getting a good understanding of the user intent is vital for e-commerce applications to surface the right product to a given customer query. Query Understanding (QU) systems are essential for this purpose, and many e-commerce providers are working on complex solutions that need to be data efficient and able to capture early emerging market trends. Query Attribute Understanding (QAU) is a sub-component of QU that involves extracting named attributes from user queries and linking them to existing e-commerce entities such as brand, material, color, etc. While extracting named entities from text has been extensively explored in the literature, QAU requires specific attention due to the nature of the queries, which are often short, noisy, ambiguous, and constantly evolving. This paper makes three contributions to QAU. 
First, we propose a novel end-to-end approach that jointly solves Named Entity Recognition (NER) and Entity Linking (NEL) and enables open-world reasoning for QAU. Second, we introduce a novel method for utilizing product graphs to enhance the representation of query entities. Finally, we present a new dataset constructed from public sources that can be used to evaluate the performance of future QAU systems.", + "authors": [ + "Thomas Ricatte", + "Donato Crisostomi" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.14", + "point2d": [ + 27.526050567626953, + -61.281856536865234 + ], + "cluster": 38.0 + }, + { + "idx": 1182, + "title": "GKD: A General Knowledge Distillation Framework for Large-scale Pre-trained Language Model", + "abstract": "Currently, the reduction in the parameter scale of large-scale pre-trained language models (PLMs) through knowledge distillation has greatly facilitated their widespread deployment on various devices. However, the deployment of knowledge distillation systems faces great challenges in real-world industrial-strength applications, which require the use of complex distillation methods on even larger-scale PLMs (over 10B), limited by memory on GPUs and the switching of methods. To overcome these challenges, we propose GKD, a general knowledge distillation framework that supports distillation on larger-scale PLMs using various distillation methods. With GKD, developers can build larger distillation models on memory-limited GPUs and easily switch and combine different distillation methods within a single framework. Experimental results show that GKD can support the distillation of at least 100B-scale PLMs and 25 mainstream methods on 8 NVIDIA A100 (40GB) GPUs.", + "authors": [ + "Shicheng Tan", + "Weng Lam Tam", + "Yuanchun Wang", + "Wenwen Gong", + "Shu Zhao", + "Peng Zhang", + "Jie Tang" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.15", + "point2d": [ + -46.18995666503906, + -20.261398315429688 + ], + "cluster": 8.0 + }, + { + "idx": 1183, + "title": "FashionKLIP: Enhancing E-Commerce Image-Text Retrieval with Fashion Multi-Modal Conceptual Knowledge Graph", + "abstract": "Image-text retrieval is a core task in the multi-modal domain, which attracts a lot of attention from both research and industry communities. Recently, the boom of visual-language pre-trained (VLP) models has greatly enhanced the performance of cross-modal retrieval. However, the fine-grained interactions between objects from different modalities are far from well-established. This issue becomes more severe in the e-commerce domain, which lacks sufficient training data and fine-grained cross-modal knowledge. To alleviate the problem, this paper proposes a novel e-commerce knowledge-enhanced VLP model, FashionKLIP. We first automatically establish a multi-modal conceptual knowledge graph from large-scale e-commerce image-text data, and then inject the prior knowledge into the VLP model to align across modalities at the conceptual level. The experiments conducted on a public benchmark dataset demonstrate that FashionKLIP effectively enhances the performance of e-commerce image-text retrieval upon state-of-the-art VLP models by a large margin. 
The application of the method in real industrial scenarios also proves the feasibility and efficiency of FashionKLIP.", + "authors": [ + "Xiaodan Wang", + "Chengyu Wang", + "Lei Li", + "Zhixu Li", + "Ben Chen", + "Linbo Jin", + "Jun Huang", + "Yanghua Xiao", + "Ming Gao" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.16", + "point2d": [ + -57.6603889465332, + 40.07908248901367 + ], + "cluster": 26.0 + }, + { + "idx": 1184, + "title": "Entity Contrastive Learning in a Large-Scale Virtual Assistant System", + "abstract": "Conversational agents are typically made up of domain (DC) and intent classifiers (IC) that identify the general subject an utterance belongs to and the specific action a user wishes to achieve. In addition, named entity recognition (NER) performs per token labeling to identify specific entities of interest in a spoken utterance. We investigate improving joint IC and NER models using entity contrastive learning that attempts to cluster similar entities together in a learned representation space. We compare a full virtual assistant system trained using entity contrastive learning to a production baseline system that does not use contrastive learning. We present both offline results, using retrospective test sets, as well as live online results from an A/B test that compared the two systems. In both the offline and online settings, entity contrastive training improved overall performance against production baselines. Furthermore, we provide a detailed analysis of learned entity embeddings, including both qualitative analysis via dimensionality-reduced visualizations and quantitative analysis by computing alignment and uniformity metrics. We show that entity contrastive learning improves alignment metrics and produces well-formed embedding clusters in representation space.", + "authors": [ + "Jonathan Rubin", + "Jason Crowley", + "George Leung", + "Morteza Ziyadi", + "Maria Minakova" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.17", + "point2d": [ + 43.40408706665039, + 5.593143939971924 + ], + "cluster": 14.0 + }, + { + "idx": 1185, + "title": "Tab-Cleaner: Weakly Supervised Tabular Data Cleaning via Pre-training for E-commerce Catalog", + "abstract": "Product catalogs, conceptually in the form of text-rich tables, are self-reported by individual retailers and thus inevitably contain noisy facts. Verifying such textual attributes in product catalogs is essential to improve their reliability. However, popular methods for processing free-text content, such as pre-trained language models, are not particularly effective on structured tabular data since they are typically trained on free-form natural language texts. In this paper, we present Tab-Cleaner, a model designed to handle error detection over text-rich tabular data following a pre-training / fine-tuning paradigm. 
We train Tab-Cleaner on a real-world Amazon Product Catalog table covering millions of products and show improvements over state-of-the-art methods by 16% PR AUC on the attribute applicability classification task and by 11% PR AUC on the attribute value validation task.", + "authors": [ + "Kewei Cheng", + "Xian Li", + "Zhengyang Wang", + "Chenwei Zhang", + "Binxuan Huang", + "Yifan Ethan Xu", + "Xin Luna Dong", + "Yizhou Sun" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.18", + "point2d": [ + 24.250877380371094, + -59.01813507080078 + ], + "cluster": 38.0 + }, + { + "idx": 1186, + "title": "Toward More Accurate and Generalizable Evaluation Metrics for Task-Oriented Dialogs", + "abstract": "Measurement of interaction quality is a critical task for the improvement of large-scale spoken dialog systems. Existing approaches to dialog quality estimation either focus on evaluating the quality of individual turns, or collect dialog-level quality measurements from end users immediately following an interaction. In contrast to these approaches, we introduce a new dialog-level annotation workflow called Dialog Quality Annotation (DQA). DQA expert annotators evaluate the quality of dialogs as a whole, and also label dialogs for attributes such as goal completion and user sentiment. In this contribution, we show that: (i) while dialog quality cannot be completely decomposed into dialog-level attributes, there is a strong relationship between some objective dialog attributes and judgments of dialog quality; (ii) for the task of dialog-level quality estimation, a supervised model trained on dialog-level annotations outperforms methods based purely on aggregating turn-level features; and (iii) the proposed evaluation model shows better domain generalization ability compared to the baselines. On the basis of these results, we argue that having high-quality human-annotated data is an important component of evaluating interaction quality for large industrial-scale voice assistant platforms.", + "authors": [ + "Abishek Komma", + "Nagesh Panyam Chandrasekarasastry", + "Timothy Leffel", + "Anuj Goyal", + "Angeliki Metallinou", + "Spyros Matsoukas", + "Aram Galstyan" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.19", + "point2d": [ + 15.18218994140625, + 70.3621597290039 + ], + "cluster": 24.0 + }, + { + "idx": 1187, + "title": "Tab-CQA: A Tabular Conversational Question Answering Dataset on Financial Reports", + "abstract": "Existing conversational question answering (CQA) datasets have usually been constructed from unstructured texts in English. In this paper, we propose Tab-CQA, a tabular CQA dataset created from Chinese financial reports that are extracted from listed companies in a wide range of different sectors in the past 30 years. From these reports, we select 2,463 tables, and manually generate 2,463 conversations with 35,494 QA pairs. Additionally, we select 4,578 tables, from which 4,578 conversations with 73,595 QA pairs are automatically created via a template-based method. With the manually- and automatically-generated conversations, Tab-CQA contains answerable and unanswerable questions. For the answerable questions, we further diversify them to cover a wide range of skills, e.g., table retrieval, fact checking, numerical reasoning, so as to accommodate real-world scenarios. 
We further propose two different tabular CQA models, a text-based model and an operation-based model, and evaluate them on Tab-CQA. Experiment results show that Tab-CQA is a very challenging dataset, where a huge performance gap exists between human and neural models. We will publicly release Tab-CQA as a benchmark testbed to promote further research on Chinese tabular CQA.", + "authors": [ + "Chuang Liu", + "Junzhuo Li", + "Deyi Xiong" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.20", + "point2d": [ + 76.29862213134766, + 4.013925552368164 + ], + "cluster": 5.0 + }, + { + "idx": 1188, + "title": "KoSBI: A Dataset for Mitigating Social Bias Risks Towards Safer Large Language Model Applications", + "abstract": "Large language models (LLMs) not only learn natural text generation abilities but also social biases against different demographic groups from real-world data. This poses a critical risk when deploying LLM-based applications. Existing research and resources are not readily applicable in South Korea due to the differences in language and culture, both of which significantly affect the biases and targeted demographic groups. This limitation requires localized social bias datasets to ensure the safe and effective deployment of LLMs. To this end, we present KosBi, a new social bias dataset of 34k pairs of contexts and sentences in Korean covering 72 demographic groups in 15 categories. We find that through filtering-based moderation, social biases in generated content can be reduced by 16.47%p on average for HyperClova (30B and 82B), and GPT-3.", + "authors": [ + "Hwaran Lee", + "Seokhee Hong", + "Joonsuk Park", + "Takyoung Kim", + "Gunhee Kim", + "Jung-woo Ha" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.21", + "point2d": [ + 17.705347061157227, + 31.790369033813477 + ], + "cluster": 10.0 + }, + { + "idx": 1189, + "title": "Improving Knowledge Production Efficiency With Question Answering on Conversation", + "abstract": "Through an online customer service application, we have collected many conversations between customer service agents and customers. Building a knowledge production system can help reduce the labor cost of maintaining the FAQ database for the customer service chatbot, whose core module is question answering (QA) on these conversations. However, most existing researches focus on document-based QA tasks, and there is a lack of researches on conversation-based QA and related datasets, especially in Chinese language. The challenges of conversation-based QA include: 1) answers may be scattered among multiple dialogue turns; 2) understanding complex dialogue contexts is more complicated than documents. To address these challenges, we propose a multi-span extraction model on this task and introduce continual pre-training and multi-task learning schemes to further improve model performance. To validate our approach, we construct two Chinese datasets using dialogues as the knowledge source, namely cs-qaconv and kd-qaconv, respectively. Experimental results demonstrate that the proposed model outperforms the baseline on both datasets. The online application also verifies the effectiveness of our method. 
The dataset kd-qaconv will be released publicly for research purposes.", + "authors": [ + "Changlin Yang", + "Siye Liu", + "Sen Hu", + "Wangshu Zhang", + "Teng Xu", + "Jing Zheng" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.22", + "point2d": [ + 19.165931701660156, + 55.33280944824219 + ], + "cluster": 5.0 + }, + { + "idx": 1190, + "title": "Mitigating the Burden of Redundant Datasets via Batch-Wise Unique Samples and Frequency-Aware Losses", + "abstract": "Datasets used to train deep learning models in industrial settings often exhibit skewed distributions with some samples repeated a large number of times. This paper presents a simple yet effective solution to reduce the increased burden of repeated computation on redundant datasets. Our approach eliminates duplicates at the batch level, without altering the data distribution observed by the model, making it model-agnostic and easy to implement as a plug-and-play module. We also provide a mathematical expression to estimate the reduction in training time that our approach provides. Through empirical evidence, we show that our approach significantly reduces training times on various models across datasets with varying redundancy factors, without impacting their performance on the Named Entity Recognition task, both on publicly available datasets and in real industrial settings. In the latter, the approach speeds training by up to 87%, and by 46% on average, with a drop in model performance of 0.2% relative at worst. We finally release a modular and reusable codebase to further advance research in this area.", + "authors": [ + "Donato Crisostomi", + "Andrea Caciolai", + "Alessandro Pedrani", + "Kay Rottmann", + "Alessandro Manzotti", + "Enrico Palumbo", + "Davide Bernardi" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.23", + "point2d": [ + -30.492380142211914, + -12.54419231414795 + ], + "cluster": 44.0 + }, + { + "idx": 1191, + "title": "The economic trade-offs of large language models: A case study", + "abstract": "Contacting customer service via chat is a common practice. Because employing customer service agents is expensive, many companies are turning to NLP that assists human agents by auto-generating responses that can be used directly or with modifications. With their ability to handle large context windows, Large Language Models (LLMs) are a natural fit for this use case. However, their efficacy must be balanced with the cost of training and serving them. This paper assesses the practical cost and impact of LLMs for the enterprise as a function of the usefulness of the responses that they generate. We present a cost framework for evaluating an NLP model\u2019s utility for this use case and apply it to a single brand as a case study in the context of an existing agent assistance product. We compare three strategies for specializing an LLM \u2014 prompt engineering, fine-tuning, and knowledge distillation \u2014 using feedback from the brand\u2019s customer service agents. 
We find that the usability of a model\u2019s responses can make up for a large difference in inference cost for our case study brand, and we extrapolate our findings to the broader enterprise space.", + "authors": [ + "Kristen Howell", + "Gwen Christian", + "Pavel Fomitchov", + "Gitit Kehat", + "Julianne Marzulla", + "Leanne Rolston", + "Jadin Tredup", + "Ilana Zimmerman", + "Ethan Selfridge", + "Joseph Bradley" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.24", + "point2d": [ + 16.08854866027832, + 47.21501922607422 + ], + "cluster": 2.0 + }, + { + "idx": 1192, + "title": "Application-Agnostic Language Modeling for On-Device ASR", + "abstract": "On-device automatic speech recognition systems face several challenges compared to server-based systems. They have to meet stricter constraints in terms of speed, disk size and memory while maintaining the same accuracy. Often they have to serve several applications with different distributions at once, such as communicating with a virtual assistant and speech-to-text. The simplest solution to serve multiple applications is to build application-specific (language) models, but this leads to an increase in memory. Therefore, we explore different data- and architecture-driven language modeling approaches to build a single application-agnostic model. We propose two novel feed-forward architectures that find an optimal trade-off between different on-device constraints. In comparison to the application-specific solution, one of our novel approaches reduces the disk size by half, while maintaining the speed and accuracy of the original model.", + "authors": [ + "Markus Nussbaum-thom", + "Lyan Verwimp", + "Youssef Oualil" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.25", + "point2d": [ + -62.26696014404297, + 16.20006561279297 + ], + "cluster": 30.0 + }, + { + "idx": 1193, + "title": "Building Accurate Low Latency ASR for Streaming Voice Search in E-commerce", + "abstract": "Automatic Speech Recognition (ASR) is essential for any voice-based application. The streaming capability of ASR becomes necessary to provide immediate feedback to the user in applications like Voice Search. LSTM/RNN and CTC based ASR systems are very simple to train and deploy for low latency streaming applications but have lower accuracy when compared to the state-of-the-art models. In this work, we build accurate LSTM, attention and CTC based streaming ASR models for large-scale Hinglish (blend of Hindi and English) Voice Search. We evaluate how various modifications in vanilla LSTM training improve the system\u2019s accuracy while preserving the streaming capabilities. We also discuss a simple integration of end-of-speech (EOS) detection with CTC models, which helps reduce the overall search latency. Our model achieves a word error rate (WER) of 3.69% without EOS and 4.78% with EOS, with ~1300 ms (~46.64%) reduction in latency.", + "authors": [ + "Abhinav Goyal", + "Nikesh Garera" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.26", + "point2d": [ + -62.98462677001953, + 17.106624603271484 + ], + "cluster": 30.0 + }, + { + "idx": 1194, + "title": "PLAtE: A Large-scale Dataset for List Page Web Extraction", + "abstract": "Recently, neural models have been leveraged to significantly improve the performance of information extraction from semi-structured websites. 
However, a barrier to continued progress is the small number of datasets large enough to train these models. In this work, we introduce the PLAtE (Pages of Lists Attribute Extraction) benchmark dataset as a challenging new web extraction task. PLAtE focuses on shopping data, specifically extractions from product review pages with multiple items, encompassing the tasks of: (1) finding product list segmentation boundaries and (2) extracting attributes for each product. PLAtE is composed of 52,898 items collected from 6,694 pages and 156,014 attributes, making it the first large-scale list page web extraction dataset. We use a multi-stage approach to collect and annotate the dataset and adapt three state-of-the-art web extraction models to the two tasks, comparing their strengths and weaknesses both quantitatively and qualitatively.", + "authors": [ + "Aidan San", + "Yuan Zhuang", + "Jan Bakus", + "Colin Lockard", + "David Ciemiewicz", + "Sandeep Atluri", + "Kevin Small", + "Yangfeng Ji", + "Heba Elfardy" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.27", + "point2d": [ + 28.610506057739258, + -58.600486755371094 + ], + "cluster": 38.0 + }, + { + "idx": 1195, + "title": "Rapid Diffusion: Building Domain-Specific Text-to-Image Synthesizers with Fast Inference Speed", + "abstract": "Text-to-Image Synthesis (TIS) aims to generate images based on textual inputs. Recently, several large pre-trained diffusion models have been released to create high-quality images with pre-trained text encoders and diffusion-based image synthesizers. However, popular diffusion-based models from the open-source community cannot support industrial domain-specific applications due to the lack of entity knowledge and low inference speed. In this paper, we propose Rapid Diffusion, a novel framework for training and deploying super-resolution, text-to-image latent diffusion models with rich entity knowledge injected and optimized networks. Furthermore, we employ BladeDISC, an end-to-end Artificial Intelligence (AI) compiler, and FlashAttention techniques to optimize computational graphs of the generated models for online deployment. Experiments verify the effectiveness of our approach in terms of image quality and inference speed. In addition, we present industrial use cases and integrate Rapid Diffusion into an AI platform to show its practical value.", + "authors": [ + "Bingyan Liu", + "Weifeng Lin", + "Zhongjie Duan", + "Chengyu Wang", + "Wu Ziheng", + "Zhang Zipeng", + "Kui Jia", + "Lianwen Jin", + "Cen Chen", + "Jun Huang" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.28", + "point2d": [ + -64.65679168701172, + 52.258121490478516 + ], + "cluster": 43.0 + }, + { + "idx": 1196, + "title": "Large Scale Generative Multimodal Attribute Extraction for E-commerce Attributes", + "abstract": "E-commerce websites (e.g. Amazon, Alibaba) have a plethora of structured and unstructured information (text and images) present on the product pages. Sellers often don\u2019t label or mislabel values of the attributes (e.g. color, size etc.) for their products. Automatically identifying these attribute values from an eCommerce product page that contains both text and images is a challenging task, especially when the attribute value is not explicitly mentioned in the catalog. 
In this paper, we present a scalable solution for this problem where we pose the attribute extraction problem as a question-answering task, which we solve using MXT, which consists of three key components: (i) MAG (Multimodal Adaptation Gate), (ii) Xception network, and (iii) T5 encoder-decoder. Our system consists of a generative model that generates attribute-values for a given product by using both textual and visual characteristics (e.g. images) of the product. We show that our system is capable of handling zero-shot attribute prediction (when attribute value is not seen in training data) and value-absent prediction (when attribute value is not mentioned in the text), which are missing in traditional classification-based and NER-based models respectively. We have trained our models using distant supervision, removing dependency on human labeling, thus making them practical for real-world applications. With this framework, we are able to train a single model for 1000s of (product-type, attribute) pairs, thus reducing the overhead of training and maintaining separate models. Extensive experiments on two real-world datasets (total 57 attributes) show that our framework improves the absolute recall@90P by 10.16% and 6.9% over the existing state-of-the-art models. In a popular e-commerce store, we have productionized our models that cater to 12K (product-type, attribute) pairs, and have extracted 150MM attribute values.", + "authors": [ + "Anant Khandelwal", + "Happy Mittal", + "Shreyas Kulkarni", + "Deepak Gupta" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.29", + "point2d": [ + 26.551082611083984, + -61.78910827636719 + ], + "cluster": 13.0 + }, + { + "idx": 1197, + "title": "Consistent Text Categorization using Data Augmentation in e-Commerce", + "abstract": "The categorization of massive e-Commerce data is a crucial, well-studied task, which is prevalent in industrial settings. In this work, we aim to improve an existing product categorization model that is already in use by a major web company, serving multiple applications. At its core, the product categorization model is a text classification model that takes a product title as an input and outputs the most suitable category out of thousands of available candidates. Upon closer inspection, we found inconsistencies in the labeling of similar items. For example, minor modifications of the product title pertaining to colors or measurements majorly impacted the model\u2019s output. This phenomenon can negatively affect downstream recommendation or search applications, leading to a sub-optimal user experience. To address this issue, we propose a new framework for consistent text categorization. Our goal is to improve the model\u2019s consistency while maintaining its production-level performance. We use a semi-supervised approach for data augmentation and present two different methods for utilizing unlabeled samples. One method relies directly on existing catalogs, while the other uses a generative model. 
We compare the pros and cons of each approach and present our experimental results.", + "authors": [ + "Noa Avigdor", + "Guy Horowitz", + "Ariel Raviv", + "Stav Yanovsky Daye" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.30", + "point2d": [ + 23.494651794433594, + -59.751216888427734 + ], + "cluster": 17.0 + }, + { + "idx": 1198, + "title": "An efficient method for Natural Language Querying on Structured Data", + "abstract": "We present an efficient and reliable approach to Natural Language Querying (NLQ) on databases (DB) which is not based on text-to-SQL type semantic parsing. Our approach simplifies the NLQ on structured data problem to the following \u201cbread and butter\u201d NLP tasks: (a) Domain classification, for choosing which DB table to query, whether the question is out-of-scope (b) Multi-head slot/entity extraction (SE) to extract the field criteria and other attributes such as their role (filter, sort, etc.) from the raw text and (c) Slot value disambiguation (SVD) to resolve/normalize raw spans from SE to a format suitable for querying a DB. This is a general-purpose, DB-language-agnostic approach and the output can be used to query any DB and return results to the user. Also each of these tasks is extremely well studied, mature, easier to collect data for, and enables better error analysis by tracing problems to specific components when something goes wrong.", + "authors": [ + "Hanoz Bhathena", + "Aviral Joshi", + "Prateek Singh" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.31", + "point2d": [ + 79.05143737792969, + 3.734755516052246 + ], + "cluster": 0.0 + }, + { + "idx": 1199, + "title": "Boosting Transformers and Language Models for Clinical Prediction in Immunotherapy", + "abstract": "Clinical prediction is an essential task in the healthcare industry. However, the recent success of transformers, on which large language models are built, has not been extended to this domain. In this research, we explore the use of transformers and language models in prognostic prediction for immunotherapy using real-world patients\u2019 clinical data and molecular profiles. This paper investigates the potential of transformers to improve clinical prediction compared to conventional machine learning approaches and addresses the challenge of few-shot learning in predicting rare disease areas. The study benchmarks the efficacy of baselines and language models on prognostic prediction across multiple cancer types and investigates the impact of different pretrained language models under few-shot regimes. The results demonstrate significant improvements in accuracy and highlight the potential of NLP in clinical research to improve early detection and intervention for different diseases.", + "authors": [ + "Zekai Chen", + "Mariann Micsinai Balan", + "Kevin Brown" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.32", + "point2d": [ + 30.724206924438477, + -37.01797866821289 + ], + "cluster": 42.0 + }, + { + "idx": 1200, + "title": "EvolveMT: an Ensemble MT Engine Improving Itself with Usage Only", + "abstract": "This work proposes a method named EvolveMT for the efficient combination of multiple machine translation (MT) engines. The method selects the output from one engine for each segment, using online learning techniques to predict the most appropriate system for each translation request. 
A neural quality estimation metric supervises the method without requiring reference translations. The method\u2019s online learning capability enables it to adapt to changes in the domain or MT engines dynamically, eliminating the requirement for retraining. The method selects a subset of translation engines to be called based on the source sentence features. The degree of exploration is configurable according to the desired quality-cost trade-off. Results from custom datasets demonstrate that EvolveMT achieves similar translation accuracy at a lower cost than selecting the best translation of each segment from all translations using an MT quality estimator. To the best of our knowledge, EvolveMT is the first MT system that adapts itself after deployment to incoming translation requests from the production environment without needing costly retraining on human feedback.", + "authors": [ + "Kamer Y\u00fcksel", + "Ahmet Gunduz", + "Mohamed Al-badrashiny", + "Hassan Sawaf" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.33", + "point2d": [ + -70.01835632324219, + -5.89885139465332 + ], + "cluster": 1.0 + }, + { + "idx": 1201, + "title": "A Static Evaluation of Code Completion by Large Language Models", + "abstract": "Large language models trained on code have shown great potential to increase the productivity of software developers. Several execution-based benchmarks have been proposed to evaluate functional correctness of model-generated code on simple programming problems. Nevertheless, it is expensive to perform the same evaluation on complex real-world projects considering the execution cost. On the other hand, static analysis tools such as linters, which can detect errors without running the program, haven\u2019t been well explored for evaluating code generation models. In this work, we propose a static evaluation framework to quantify static errors in Python code completions, by leveraging Abstract Syntax Trees. Compared with execution-based evaluation, our method is not only more efficient, but also applicable to code in the wild. For experiments, we collect code context from open source repos to generate one million function bodies using public models. Our static analysis reveals that Undefined Name and Unused Variable are the most common errors among others made by language models. Through extensive studies, we also show the impact of sampling temperature, model size, and context on static errors in code completions.", + "authors": [ + "Hantian Ding", + "Varun Kumar", + "Yuchen Tian", + "Zijian Wang", + "Rob Kwiatkowski", + "Xiaopeng Li", + "Murali Krishna Ramanathan", + "Baishakhi Ray", + "Parminder Bhatia", + "Sudipta Sengupta" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.34", + "point2d": [ + -8.82791805267334, + -55.027015686035156 + ], + "cluster": 11.0 + }, + { + "idx": 1202, + "title": "Scalable and Safe Remediation of Defective Actions in Self-Learning Conversational Systems", + "abstract": "Off-Policy reinforcement learning has been the driving force for the state-of-the-art conversational AIs, leading to more natural human-agent interactions and improving user satisfaction for goal-oriented agents. However, in large-scale commercial settings, it is often challenging to balance policy improvements and experience continuity on the broad spectrum of applications handled by such a system. 
In the literature, off-policy evaluation and guard-railing on aggregate statistics have been commonly used to address this problem. In this paper, we propose a method for curating and leveraging high-precision samples sourced from historical regression incident reports to validate, safeguard, and improve policies prior to online deployment. We conducted extensive experiments using data from a real-world conversational system and actual regression incidents. The proposed method is currently deployed in our production system to protect customers against broken experiences and enable long-term policy improvements.", + "authors": [ + "Sarthak Ahuja", + "Mohammad Kachuee", + "Fatemeh Sheikholeslami", + "Weiqing Liu", + "Jaeyoung Do" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.35", + "point2d": [ + 19.761634826660156, + 78.04741668701172 + ], + "cluster": 2.0 + }, + { + "idx": 1203, + "title": "MobileNMT: Enabling Translation in 15MB and 30ms", + "abstract": "Deploying NMT models on mobile devices is essential for privacy, low latency, and offline scenarios. For high model capacity, NMT models are rather large. Running these models on devices is challenging with limited storage, memory, computation, and power consumption. Existing work either focuses only on a single metric such as FLOPs, or on a general engine that is not good at auto-regressive decoding. In this paper, we present MobileNMT, a system that can translate in 15MB and 30ms on devices. We propose a series of principles for model compression when combined with quantization. Further, we implement an engine that is friendly to INT8 and decoding. With the co-design of the model and engine, compared with the existing system, we speed up 47.0x and save 99.5% of memory with only an 11.6% loss of BLEU. Our code will be publicly available after the anonymity period.", + "authors": [ + "Ye Lin", + "Xiaohui Wang", + "Zhexi Zhang", + "Mingxuan Wang", + "Tong Xiao", + "Jingbo Zhu" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.36", + "point2d": [ + -50.22500991821289, + -12.335553169250488 + ], + "cluster": 30.0 + }, + { + "idx": 1204, + "title": "Multi-doc Hybrid Summarization via Salient Representation Learning", + "abstract": "Multi-document summarization has recently been gaining more and more attention and serves as an invaluable tool to obtain key facts among a large information pool. In this paper, we propose a multi-document hybrid summarization approach, which simultaneously generates a human-readable summary and extracts corresponding key evidence based on multi-doc inputs. To fulfill that purpose, we crafted a salient representation learning method to induce latent salient features, which are effective for joint evidence extraction and summary generation. In order to train this model, we conducted multi-task learning to optimize a composite loss, constructed over extractive and abstractive sub-components in a hierarchical way. 
We implemented the system based on a ubiquitously adopted transformer architecture and conducted experimental studies on multiple datasets across two domains, achieving superior performance over the baselines.", + "authors": [ + "Min Xiao" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.37", + "point2d": [ + -8.214997291564941, + 41.75984573364258 + ], + "cluster": 7.0 + }, + { + "idx": 1205, + "title": "SaFER: A Robust and Efficient Framework for Fine-tuning BERT-based Classifier with Noisy Labels", + "abstract": "Learning on noisy datasets is a challenging problem when pre-trained language models are applied to real-world text classification tasks. In numerous industrial applications, acquiring task-specific datasets with 100% accurate labels is difficult, thus many datasets are accompanied by label noise at different levels. Previous work has shown that existing noise-handling methods could not improve the peak performance of BERT on noisy datasets, and might even deteriorate it. In this paper, we propose SaFER, a robust and efficient fine-tuning framework for BERT-based text classifiers, combating label noise without access to any clean data for training or validation. Utilizing a label-agnostic early-stopping strategy and self-supervised learning, our proposed framework achieves superior performance in terms of both accuracy and speed on multiple text classification benchmarks. The trained model is finally fully deployed in several industrial biomedical literature mining tasks and demonstrates high effectiveness and efficiency.", + "authors": [ + "Zhenting Qi", + "Xiaoyu Tan", + "Chao Qu", + "Yinghui Xu", + "Yuan Qi" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.38", + "point2d": [ + -3.6662464141845703, + -20.09052848815918 + ], + "cluster": 17.0 + }, + { + "idx": 1206, + "title": "Chemical Language Understanding Benchmark", + "abstract": "In this paper, we introduce the benchmark datasets named CLUB (Chemical Language Understanding Benchmark) to facilitate NLP research in the chemical industry. We have 4 datasets consisting of text and token classification tasks. To the best of our knowledge, it is one of the first examples of chemical language understanding benchmark datasets consisting of tasks for both patent and literature articles provided by an industrial organization. All the datasets are internally made by chemists from scratch. Finally, we evaluate the datasets on various language models based on BERT and RoBERTa, and demonstrate that the models perform better when the domain of the pretrained models is closer to the chemistry domain. 
We provide baselines for our benchmark, with an average score of 0.8054, and we hope this benchmark is used by many researchers in both industry and academia.", + "authors": [ + "Yunsoo Kim", + "Hyuk Ko", + "Jane Lee", + "Hyun Young Heo", + "Jinyoung Yang", + "Sungsoo Lee", + "Kyu-hwang Lee" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.39", + "point2d": [ + 27.687196731567383, + -25.403772354125977 + ], + "cluster": 6.0 + }, + { + "idx": 1207, + "title": "HyperT5: Towards Compute-Efficient Korean Language Modeling", + "abstract": "Pretraining and fine-tuning language models have become the standard practice in industrial natural language processing (NLP), but developing and deploying general-purpose language models without the abundant computation or data resources is a real-world issue faced by smaller organizations or communities whose main focus is languages with less accessible resources (e.g., non-English). This paper explores the sequence-to-sequence (seq2seq) language model architecture as a more practical and compute-efficient alternative to the decoder-oriented approach (e.g., GPT-3), accompanied by novel findings in compute-optimality analyses. We successfully trained billion-scale Korean-language seq2seq language models that strongly outperform other competitive models in Korean benchmarks. Moreover, we demonstrate that such language models can be more efficiently utilized by employing a heavy pre-finetuning strategy, by showcasing a case study on dialog-task adaptation. Our case study shows that adopting language models with more readily available domain-specific unlabeled data greatly improves fine-tuning data efficiency in low-resource settings.", + "authors": [ + "Dongju Park", + "Soonwon Ka", + "Kang Min Yoo", + "Gichang Lee", + "Jaewook Kang" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.40", + "point2d": [ + -34.303428649902344, + -19.61782455444336 + ], + "cluster": 8.0 + }, + { + "idx": 1208, + "title": "Semantic Ambiguity Detection in Sentence Classification using Task-Specific Embeddings", + "abstract": "Ambiguity is a major obstacle to providing services based on sentence classification. However, because of the structural limitations of the service, there may not be sufficient contextual information to resolve the ambiguity. In this situation, we focus on ambiguity detection so that service design considering ambiguity is possible. We utilize similarity in a semantic space to detect ambiguity in service scenarios and training data. In addition, we apply task-specific embeddings to improve performance. Our results demonstrate that ambiguities and resulting labeling errors in training data or scenarios can be detected. Additionally, we confirm that it can be used to debug services.", + "authors": [ + "Jong Myoung Kim", + "Young-jun Lee", + "Sangkeun Jung", + "Ho-jin Choi" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.41", + "point2d": [ + 42.55147171020508, + 7.126704692840576 + ], + "cluster": 9.0 + }, + { + "idx": 1209, + "title": "Reliable and Interpretable Drift Detection in Streams of Short Texts", + "abstract": "Data drift is the change in model input data that is one of the key factors leading to machine learning models\u2019 performance degradation over time. Monitoring drift helps detect these issues and prevent their harmful consequences. 
Meaningful drift interpretation is a fundamental step towards effective re-training of the model. In this study we propose an end-to-end framework for reliable model-agnostic change-point detection and interpretation in large task-oriented dialog systems, proven effective in multiple customer deployments. We evaluate our approach and demonstrate its benefits with a novel variant of intent classification training dataset, simulating customer requests to a dialog system. We make the data publicly available.", + "authors": [ + "Ella Rabinovich", + "Matan Vetzler", + "Samuel Ackerman", + "Ateret Anaby Tavor" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.42", + "point2d": [ + 4.485043048858643, + 23.995237350463867 + ], + "cluster": 2.0 + }, + { + "idx": 1210, + "title": "Sharing Encoder Representations across Languages, Domains and Tasks in Large-Scale Spoken Language Understanding", + "abstract": "Leveraging representations from pre-trained transformer-based encoders achieves state-of-the-art performance on numerous NLP tasks.Larger encoders can improve accuracy for spoken language understanding (SLU) but are challenging to use given the inference latency constraints of online systems (especially on CPU machines).We evaluate using a larger 170M parameter BERT encoder that shares representations across languages, domains and tasks for SLU compared to using smaller 17M parameter BERT encoders with language-, domain- and task-decoupled finetuning.Running inference with a larger shared encoder on GPU is latency neutral and reduces infrastructure cost compared to running inference for decoupled smaller encoders on CPU machines.The larger shared encoder reduces semantic error rates by 4.62% for test sets representing user requests to voice-controlled devices and 5.79% on the tail of the test sets on average across four languages.", + "authors": [ + "Jonathan Hueser", + "Judith Gaspers", + "Thomas Gueudre", + "Chandana Prakash", + "Jin Cao", + "Daniil Sorokin", + "Quynh Do", + "Nicolas Anastassacos", + "Tobias Falke", + "Turan Gojayev" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.43", + "point2d": [ + -52.812164306640625, + 16.64987564086914 + ], + "cluster": 30.0 + }, + { + "idx": 1211, + "title": "Annotating Research Infrastructure in Scientific Papers: An NLP-driven Approach", + "abstract": "In this work, we present a natural language processing (NLP) pipeline for the identification, extraction and linking of Research Infrastructure (RI) used in scientific publications. Links between scientific equipment and publications where the equipment was used can support multiple use cases, such as evaluating the impact of RI investment, and supporting Open Science and research reproducibility. These links can also be used to establish a profile of the RI portfolio of each institution and associate each equipment with scientific output. The system we are describing here is already in production, and has been used to address real business use cases, some of which we discuss in this paper. The computational pipeline at the heart of the system comprises both supervised and unsupervised modules to detect the usage of research equipment by processing the full text of the articles. Additionally, we have created a knowledge graph of RI, which is utilized to annotate the articles with metadata. 
Finally, examples of the business value of the insights made possible by this NLP pipeline are illustrated.", + "authors": [ + "Seyed Amin Tabatabaei", + "Georgios Cheirmpos", + "Marius Doornenbal", + "Alberto Zigoni", + "Veronique Moore", + "Georgios Tsatsaronis" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.44", + "point2d": [ + 17.100833892822266, + 14.569324493408203 + ], + "cluster": 40.0 + }, + { + "idx": 1212, + "title": "Event-Centric Query Expansion in Web Search", + "abstract": "In search engines, query expansion (QE) is a crucial technique to improve search experience. Previous studies often rely on long-term search log mining, which leads to slow updates and is sub-optimal for time-sensitive news searches. In this work, we present Event-Centric Query Expansion (EQE), the QE system used in a famous Chinese search engine. EQE utilizes a novel event retrieval framework that consists of four stages, i.e., event collection, event reformulation, semantic retrieval and online ranking, which can select the best expansion from a significant number of potential events rapidly and accurately. Specifically, we first collect and filter news headlines from websites. Then we propose a generation model that incorporates contrastive learning and prompt-tuning techniques to reformulate these headlines into concise candidates. Additionally, we fine-tune a dual-tower semantic model to serve as an encoder for event retrieval and explore a two-stage contrastive training approach to enhance the accuracy of event retrieval. Finally, we rank the retrieved events and select the optimal one as QE, which is then used to improve the retrieval of event-related documents. Through offline analysis and online A/B testing, we observed that the EQE system has significantly improved many indicators compared to the baseline. The system has been deployed in a real production environment and serves hundreds of millions of users.", + "authors": [ + "Yanan Zhang", + "Weijie Cui", + "Yangfan Zhang", + "Xiaoling Bai", + "Zhe Zhang", + "Jin Ma", + "Xiang Chen", + "Tianhua Zhou" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.45", + "point2d": [ + 16.964345932006836, + -18.550695419311523 + ], + "cluster": 18.0 + }, + { + "idx": 1213, + "title": "Transferable and Efficient: Unifying Dynamic Multi-Domain Product Categorization", + "abstract": "As e-commerce platforms develop different business lines, a special but challenging product categorization scenario emerges, where there are multiple domain-specific category taxonomies and each of them evolves dynamically over time. In order to unify the categorization process and ensure efficiency, we propose a two-stage taxonomy-agnostic framework that relies solely on calculating the semantic relatedness between product titles and category names in the vector space. To further enhance domain transferability and better exploit cross-domain data, we design two plug-in modules: a heuristic mapping scorer and a pretrained contrastive ranking module with the help of meta concepts, which represent keyword knowledge shared across domains. Comprehensive offline experiments show that our method outperforms strong baselines on three dynamic multi-domain product categorization (DMPC) tasks, and online experiments reconfirm its efficacy with a 5% increase in seasonal purchase revenue. 
Related datasets will be released.", + "authors": [ + "Shansan Gong", + "Zelin Zhou", + "Shuo Wang", + "Fengjiao Chen", + "Xiujie Song", + "Xuezhi Cao", + "Yunsen Xian", + "Kenny Zhu" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.46", + "point2d": [ + 23.846403121948242, + -60.97172927856445 + ], + "cluster": 17.0 + }, + { + "idx": 1214, + "title": "DISCOSQA: A Knowledge Base Question Answering System for Space Debris based on Program Induction", + "abstract": "Space program agencies execute complex satellite operations that need to be supported by the technical knowledge contained in their extensive information systems.Knowledge Base (KB) databases are an effective way of storing and accessing such information to scale.In this work we present a system, developed for the European Space Agency, that can answer complex natural language queries, to support engineers in accessing the information contained in a KB that models the orbital space debris environment. Our system is based on a pipeline which first generates a program sketch from a natural language question, then specializes the sketch into a concrete query program with mentions of entities, attributes and relations, and finally executes the program against the database.This pipeline decomposition approach enables us to train the system by leveraging out-of-domain data and semi-synthetic data generated by GPT-3, thus reducing overfitting and shortcut learning even with limited amount of in-domain training data.", + "authors": [ + "Paul Darm", + "Antonio Valerio Miceli Barone", + "Shay B. Cohen", + "Annalisa Riccardi" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.47", + "point2d": [ + 71.67832946777344, + -5.789941310882568 + ], + "cluster": 0.0 + }, + { + "idx": 1215, + "title": "BADGE: Speeding Up BERT Inference after Deployment via Block-wise Bypasses and Divergence-based Early Exiting", + "abstract": "Early exiting can reduce the average latency of pre-trained language models (PLMs) via its adaptive inference mechanism and work with other inference speed-up methods like model pruning, thus drawing much attention from the industry. In this work, we propose a novel framework, BADGE, which consists of two off-the-shelf methods for improving PLMs\u2019 early exiting. We first address the issues of training a multi-exit PLM, the backbone model for early exiting. We propose the novel architecture of block-wise bypasses, which can alleviate the conflicts in jointly training multiple intermediate classifiers and thus improve the overall performances of multi-exit PLM while introducing negligible additional flops to the model. Second, we propose a novel divergence-based early exiting (DGE) mechanism, which obtains early exiting signals by comparing the predicted distributions of two adjacent layers\u2019 exits. 
Extensive experiments on three proprietary datasets and three GLUE benchmark tasks demonstrate that our method can obtain a better speedup-performance trade-off than the existing baseline methods.\\footnote{Code will be made publicly available to the research community upon acceptance.}", + "authors": [ + "Wei Zhu", + "Peng Wang", + "Yuan Ni", + "Guotong Xie", + "Xiaoling Wang" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.48", + "point2d": [ + -40.869075775146484, + -22.31410789489746 + ], + "cluster": 44.0 + }, + { + "idx": 1216, + "title": "K-pop and fake facts: from texts to smart alerting for maritime security", + "abstract": "Maritime security requires full-time monitoring of the situation, mainly based on technical data (radar, AIS) but also from OSINT-like inputs (e.g., newspapers). Some threats to the operational reliability of this maritime surveillance, such as malicious actors, introduce discrepancies between hard and soft data (sensors and texts), either by tweaking their AIS emitters or by emitting false information on pseudo-newspapers.Many techniques exist to identify these pieces of false information, including using knowledge base population techniques to build a structured view of the information. This paper presents a use case for suspect data identification in a maritime setting. The proposed system UMBAR ingests data from sensors and texts, processing them through an information extraction step, in order to feed a Knowledge Base and finally perform coherence checks between the extracted facts.", + "authors": [ + "Maxime Prieur", + "Souhir Gahbiche", + "Guillaume Gadek", + "Sylvain Gatepaille", + "Kilian Vasnier", + "Valerian Justine" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.49", + "point2d": [ + 66.69711303710938, + -10.96755313873291 + ], + "cluster": 19.0 + }, + { + "idx": 1217, + "title": "Evaluating Embedding APIs for Information Retrieval", + "abstract": "The ever-increasing size of language models curtails their widespread access to the community, thereby galvanizing many companies and startups into offering access to large language models through APIs. One particular API, suitable for dense retrieval, is the semantic embedding API that builds vector representations of a given text. With a growing number of APIs at our disposal, in this paper, our goal is to analyze semantic embedding APIs in realistic retrieval scenarios in order to assist practitioners and researchers in finding suitable services according to their needs. Specifically, we wish to investigate the capabilities of existing APIs on domain generalization and multilingual retrieval. For this purpose, we evaluate the embedding APIs on two standard benchmarks, BEIR, and MIRACL. We find that re-ranking BM25 results using the APIs is a budget-friendly approach and is most effective on English, in contrast to the standard practice, i.e., employing them as first-stage retrievers. For non-English retrieval, re-ranking still improves the results, but a hybrid model with BM25 works best albeit at a higher cost. 
We hope our work lays the groundwork for thoroughly evaluating APIs that are critical in search and, more broadly, in information retrieval.", + "authors": [ + "Ehsan Kamalloo", + "Xinyu Zhang", + "Odunayo Ogundepo", + "Nandan Thakur", + "David Alfonso-hermelo", + "Mehdi Rezagholizadeh", + "Jimmy Lin" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.50", + "point2d": [ + 12.636736869812012, + -17.40672492980957 + ], + "cluster": 18.0 + }, + { + "idx": 1218, + "title": "Domain-Agnostic Neural Architecture for Class Incremental Continual Learning in Document Processing Platform", + "abstract": "Production deployments in complex systems require ML architectures to be highly efficient and usable across multiple tasks. Particularly demanding are classification problems in which data arrives in a streaming fashion and each class is presented separately. Recent methods with stochastic gradient learning have been shown to struggle in such setups or to have limitations, such as memory buffers or being restricted to specific domains, that disable their usage in real-world scenarios. For this reason, we present a fully differentiable architecture based on the Mixture of Experts model that enables the training of high-performance classifiers when examples from each class are presented separately. We conducted exhaustive experiments that proved its applicability in various domains and ability to learn online in production environments. The proposed technique achieves SOTA results without a memory buffer and clearly outperforms the reference methods.", + "authors": [ + "Mateusz W\u00f3jcik", + "Witold Ko\u015bciukiewicz", + "Mateusz Baran", + "Tomasz Kajdanowicz", + "Adam Gonczarek" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.51", + "point2d": [ + -27.04215431213379, + -13.169522285461426 + ], + "cluster": 44.0 + }, + { + "idx": 1219, + "title": "Regression-Free Model Updates for Spoken Language Understanding", + "abstract": "In real-world systems, an important requirement for model updates is to avoid regressions in user experience caused by flips of previously correct classifications to incorrect ones. Multiple techniques for that have been proposed in the recent literature. In this paper, we apply one such technique, focal distillation, to model updates in a goal-oriented dialog system and assess its usefulness in practice. In particular, we evaluate its effectiveness for key language understanding tasks, including sentence classification and sequence labeling tasks; we further assess its effect when applied to repeated model updates over time, and test its compatibility with mislabeled data. 
Our experiments on a public benchmark and data from a deployed dialog system demonstrate that focal distillation can substantially reduce regressions, at only minor drops in accuracy, and that it further outperforms naive supervised training in challenging mislabeled data and label expansion settings.", + "authors": [ + "Andrea Caciolai", + "Verena Weber", + "Tobias Falke", + "Alessandro Pedrani", + "Davide Bernardi" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.52", + "point2d": [ + 2.979152202606201, + 25.83684539794922 + ], + "cluster": 2.0 + }, + { + "idx": 1220, + "title": "Reducing cohort bias in natural language understanding systems with targeted self-training scheme", + "abstract": "Bias in machine learning models can be an issue when the models are trained on particular types of data that do not generalize well, causing underperformance in certain groups of users. In this work, we focus on reducing the bias related to new customers in a digital voice assistant system. It is observed that natural language understanding models often have lower performance when dealing with requests coming from new users rather than experienced users. To mitigate this problem, we propose a framework that consists of two phases: (1) a fixing phase with four active learning strategies used to identify important samples coming from new users, and (2) a self-training phase where a teacher model trained from the first phase is used to annotate semi-supervised samples to expand the training data with relevant cohort utterances. We explain practical strategies that involve the identification of representative cohort-based samples through density clustering as well as employing implicit customer feedback to improve new customers\u2019 experience. We demonstrate the effectiveness of our approach in a real-world, large-scale voice assistant system for two languages, German and French, through both offline experiments and A/B testing.", + "authors": [ + "Dieu-thu Le", + "Gabriela Hernandez", + "Bei Chen", + "Melanie Bradford" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.53", + "point2d": [ + 4.285247802734375, + 25.583995819091797 + ], + "cluster": 2.0 + }, + { + "idx": 1221, + "title": "Content Moderation for Evolving Policies using Binary Question Answering", + "abstract": "Content moderation on social media is governed by policies that are intricate and frequently updated with evolving world events. However, automated content moderation systems often restrict easy adaptation to policy changes and are expected to learn policy intricacies from limited amounts of labeled data, which makes effective policy compliance challenging. We propose to model content moderation as a binary question answering problem where the questions validate the loosely coupled themes constituting a policy. A decision logic is applied on top to aggregate the theme-specific validations. This way the questions pass theme information to a transformer network as explicit policy prompts, which in turn enables explainability. This setting further allows for faster adaptation to policy updates by leveraging zero-shot capabilities of pre-trained transformers. 
We showcase improved recall for our proposed method at 95\\% precision on two proprietary datasets of social media posts and comments, respectively annotated under curated Hate Speech and Commercial Spam policies.", + "authors": [ + "Sankha Subhra Mullick", + "Mohan Bhambhani", + "Suhit Sinha", + "Akshat Mathur", + "Somya Gupta", + "Jidnya Shah" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.54", + "point2d": [ + 14.543072700500488, + 38.44400405883789 + ], + "cluster": 19.0 + }, + { + "idx": 1222, + "title": "Weighted Contrastive Learning With False Negative Control to Help Long-tailed Product Classification", + "abstract": "Item categorization (IC) aims to classify product descriptions into leaf nodes in a categorical taxonomy, which is a key technology used in a wide range of applications. Since most datasets have a long-tailed distribution, classification performance on tail labels tends to be poor due to scarce supervision, causing many issues in real-life applications. To address the IC task\u2019s long-tail issue, K-positive contrastive loss (KCL) was proposed for the image classification task and can be applied to the IC task when using text-based contrastive learning, e.g., SimCSE. However, one shortcoming of using KCL has been neglected in previous research: false negative (FN) instances may harm the KCL\u2019s representation learning. To address the FN issue in the KCL, we propose to re-weight the positive pairs in the KCL loss with a regularization that constrains the sum of the weights to be as close to K+1 as possible. After controlling FN instances with the proposed method, IC performance is further improved and is superior to that of other LT-addressing methods.", + "authors": [ + "Tianqi Wang", + "Lei Chen", + "Xiaodan Zhu", + "Younghun Lee", + "Jing Gao" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.55", + "point2d": [ + 21.85955238342285, + -59.41510772705078 + ], + "cluster": 17.0 + }, + { + "idx": 1223, + "title": "Towards Building a Robust Toxicity Predictor", + "abstract": "Recent NLP literature pays little attention to the robustness of toxicity language predictors, even though these systems are most likely to be used in adversarial contexts. This paper presents a novel adversarial attack, \\texttt{ToxicTrap}, introducing small word-level perturbations to fool SOTA text classifiers into predicting toxic text samples as benign. \\texttt{ToxicTrap} exploits greedy-based search strategies to enable fast and effective generation of toxic adversarial examples. Two novel goal function designs allow \\texttt{ToxicTrap} to identify weaknesses in both multiclass and multilabel toxic language detectors. Our empirical results show that SOTA toxicity text classifiers are indeed vulnerable to the proposed attacks, attaining over 98\\% attack success rates in multilabel cases. 
We also show how vanilla adversarial training and its improved version can help increase the robustness of a toxicity detector even against unseen attacks.", + "authors": [ + "Dmitriy Bespalov", + "Sourav Bhabesh", + "Yi Xiang", + "Liutong Zhou", + "Yanjun Qi" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.56", + "point2d": [ + 6.671026229858398, + 13.00820541381836 + ], + "cluster": 15.0 + }, + { + "idx": 1224, + "title": "AI Coach Assist: An Automated Approach for Call Recommendation in Contact Centers for Agent Coaching", + "abstract": "In recent years, the utilization of Artificial Intelligence (AI) in the contact center industry is on the rise. One area where AI can have a significant impact is in the coaching of contact center agents. By analyzing call transcripts, AI can quickly determine which calls are most relevant for coaching purposes, and provide relevant feedback and insights to the contact center manager or supervisor. In this paper, we present \u201cAI Coach Assist\u201d, which leverages pre-trained transformer-based language models to determine whether a given call is coachable or not based on the quality assurance (QA) queries/questions asked by the contact center managers or supervisors. The system was trained and evaluated on a large dataset collected from real-world contact centers and provides an efficient and effective way to determine which calls are most relevant for coaching purposes. Extensive experimental evaluation demonstrates the potential of AI Coach Assist to improve the coaching process, enhancing the performance of contact center agents.", + "authors": [ + "Md Tahmid Rahman Laskar", + "Cheng Chen", + "Xue-yong Fu", + "Mahsa Azizi", + "Shashi Bhushan", + "Simon Corston-oliver" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.57", + "point2d": [ + 32.316253662109375, + 63.06163787841797 + ], + "cluster": 33.0 + }, + { + "idx": 1225, + "title": "Unified Contextual Query Rewriting", + "abstract": "Query rewriting (QR) is an important technique for reducing user friction (i.e., recovering from ASR or system errors) and for contextual carryover (i.e., ellipsis and co-reference) in conversational AI systems. Recently, generation-based QR models have achieved promising results on these two tasks separately. Although these two tasks have many similarities, such as both using the previous dialogue along with the current request as model input, there is no unified model to solve them jointly. To this end, we propose a unified contextual query rewriting model that unifies QR for both friction reduction and contextual carryover purposes. Moreover, we involve multiple auxiliary tasks such as trigger prediction and NLU interpretation tasks to boost the performance of the rewrite. We leverage the text-to-text unified framework which uses independent tasks with weighted loss to account for task importance. Then we propose new unified multitask learning strategies including a sequential model which outputs one sentence for multiple tasks, and a hybrid model where some tasks are independent and some tasks are sequentially generated. 
Our experimental results demonstrate the effectiveness of the proposed unified learning methods.", + "authors": [ + "Yingxue Zhou", + "Jie Hao", + "Mukund Rungta", + "Yang Liu", + "Eunah Cho", + "Xing Fan", + "Yanbin Lu", + "Vishal Vasudevan", + "Kellen Gillespie", + "Zeynab Raeesy" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.58", + "point2d": [ + 63.2313346862793, + 25.26576805114746 + ], + "cluster": 5.0 + }, + { + "idx": 1226, + "title": "Context-Aware Query Rewriting for Improving Users\u2019 Search Experience on E-commerce Websites", + "abstract": "E-commerce queries are often short and ambiguous. Consequently, query understanding often uses query rewriting to disambiguate user-input queries. While using e-commerce search tools, users tend to enter multiple searches, which we call context, before purchasing. These history searches contain contextual insights about users\u2019 true shopping intents. Therefore, modeling such contextual information is critical to a better query rewriting model. However, existing query rewriting models ignore users\u2019 history behaviors and consider only the instant search query, which is often a short string offering limited information about the true shopping intent.We propose an end-to-end context-aware query rewriting model to bridge this gap, which takes the search context into account. Specifically, our model builds a session graph using the history search queries and their contained words. We then employ a graph attention mechanism that models cross-query relations and computes contextual information of the session. The model subsequently calculates session representations by combining the contextual information with the instant search query using an aggregation network. The session representations are then decoded to generate rewritten queries. Empirically, we demonstrate the superiority of our method to state-of-the-art approaches under various metrics.", + "authors": [ + "Simiao Zuo", + "Qingyu Yin", + "Haoming Jiang", + "Shaohui Xi", + "Bing Yin", + "Chao Zhang", + "Tuo Zhao" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.59", + "point2d": [ + 65.1242446899414, + 23.051225662231445 + ], + "cluster": 18.0 + }, + { + "idx": 1227, + "title": "Federated Learning of Gboard Language Models with Differential Privacy", + "abstract": "We train and deploy language models (LMs) with federated learning (FL) and differential privacy (DP) in Google Keyboard (Gboard). The recent DP-Follow the Regularized Leader (DP-FTRL) algorithm is applied to achieve meaningfully formal DP guarantees without requiring uniform sampling of clients. To provide favorable privacy-utility trade-offs, we introduce a new client participation criterion and discuss the implication of its configuration in large scale systems. We show how quantile-based clip estimation can be combined with DP-FTRL to adaptively choose the clip norm during training or reduce the hyperparameter tuning in preparation of training. 
With the help of pretraining on public data, we trained and deployed more than fifteen Gboard LMs that achieve high utility and $\\rho$-zCDP privacy guarantees with $\\rho \\in (0.3, 2)$, with one model additionally trained with secure aggregation. We summarize our experience and provide concrete suggestions on DP training for practitioners.", + "authors": [ + "Zheng Xu", + "Yanxiang Zhang", + "Galen Andrew", + "Christopher Choquette", + "Peter Kairouz", + "Brendan Mcmahan", + "Jesse Rosenstock", + "Yuanbo Zhang" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.60", + "point2d": [ + -1.2191225290298462, + 13.867131233215332 + ], + "cluster": 48.0 + }, + { + "idx": 1228, + "title": "RadLing: Towards Efficient Radiology Report Understanding", + "abstract": "Most natural language tasks in the radiology domain use language models pre-trained on biomedical corpora. There are few pretrained language models trained specifically for radiology, and fewer still that have been trained in a low data setting and gone on to produce comparable results in fine-tuning tasks. We present RadLing, a continuously pretrained language model using the ELECTRA-small architecture, trained using over 500K radiology reports, that can compete with state-of-the-art results for fine-tuning tasks in the radiology domain. Our main contribution in this paper is knowledge-aware masking, a taxonomic knowledge-assisted pre-training task that dynamically masks tokens to inject knowledge during pretraining. In addition, we also introduce a knowledge base-aided vocabulary extension to adapt the general tokenization vocabulary to the radiology domain.", + "authors": [ + "Rikhiya Ghosh", + "Oladimeji Farri", + "Sanjeev Kumar Karn", + "Manuela Danu", + "Ramya Vunikili", + "Larisa Micu" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.61", + "point2d": [ + 32.13959503173828, + -34.5948371887207 + ], + "cluster": 42.0 + }, + { + "idx": 1229, + "title": "Predicting Customer Satisfaction with Soft Labels for Ordinal Classification", + "abstract": "In a typical call center, only up to 8% of callers leave a Customer Satisfaction (CSAT) survey response at the end of the call, and these tend to be customers with strongly positive or negative experiences. To manage this data sparsity and response bias, we outline a predictive CSAT deep learning algorithm that infers CSAT on the 1-5 scale on inbound calls to the call center with minimal latency. The key metric to maximize is the precision for CSAT = 1 (lowest CSAT). We maximize this metric in two ways. First, reframing the problem as a binary-class, rather than five-class, problem during model fine-tuning, and then mapping binary outcomes back to five classes using temperature-scaled model probabilities. Second, using soft labels to represent the classes. 
The result is a production model able to support key customer workflows with high accuracy over millions of calls a month.", + "authors": [ + "Etienne Manderscheid", + "Matthias Lee" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.62", + "point2d": [ + 34.586544036865234, + 60.523948669433594 + ], + "cluster": 44.0 + }, + { + "idx": 1230, + "title": "Accurate Training of Web-based Question Answering Systems with Feedback from Ranked Users", + "abstract": "Recent work has shown that large-scale annotated datasets are essential for training state-of-the-art Question Answering (QA) models. Unfortunately, creating this data is expensive and requires a huge amount of annotation work. An alternative and cheaper source of supervision is given by feedback data collected from deployed QA systems. This data can be collected from tens of millions of users with no additional cost, for real-world QA services, e.g., Alexa, Google Home, etc. The main drawback is the noise affecting feedback on individual examples. Recent literature on QA systems has shown the benefit of training models even with noisy feedback. However, these studies have multiple limitations: (i) they used uniform random noise to simulate feedback responses, which is typically an unrealistic approximation as noise follows specific patterns, depending on target examples and users; and (ii) they do not show how to aggregate feedback for improving training signals. In this paper, we first collect a large-scale (16M) QA dataset with real feedback sampled from the QA traffic of a popular Virtual Assistant. Second, we use this data to develop two strategies for filtering unreliable users and thus de-noise feedback: (i) ranking users with an automatic classifier, and (ii) aggregating feedback over similar instances and comparing users with each other. Finally, we train QA models on our filtered feedback data, showing a significant improvement over the state of the art.", + "authors": [ + "Liang Wang", + "Ivano Lauriola", + "Alessandro Moschitti" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.63", + "point2d": [ + 62.69215774536133, + 15.31623649597168 + ], + "cluster": 5.0 + }, + { + "idx": 1231, + "title": "SPM: A Split-Parsing Method for Joint Multi-Intent Detection and Slot Filling", + "abstract": "In a task-oriented dialogue system, joint intent detection and slot filling for multi-intent utterances becomes meaningful since users tend to query more. The current state-of-the-art studies choose to process multi-intent utterances through a single joint model of sequence labelling and multi-label classification, which cannot generalize to utterances with more intents than the training samples. Meanwhile, it lacks the ability to assign slots to each corresponding intent. To overcome these problems, we propose a Split-Parsing Method (SPM) for joint multiple intent detection and slot filling, which is a two-stage method. It first splits an input sentence into multiple sub-sentences, each of which contains a single intent, and then a joint single intent detection and slot filling model is applied to parse each sub-sentence recurrently. Finally, we integrate the parsed results. The sub-sentence split task is also treated as a sequence labelling problem with only one entity-label, which can effectively generalize to a sentence with more intents unseen in the training set. 
Experimental results on three multi-intent datasets show that our method obtains substantial improvements over different baselines.", + "authors": [ + "Sheng Jiang", + "Su Zhu", + "Ruisheng Cao", + "Qingliang Miao", + "Kai Yu" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.64", + "point2d": [ + -7.8971076011657715, + 73.15995788574219 + ], + "cluster": 32.0 + }, + { + "idx": 1232, + "title": "NAG-NER: a Unified Non-Autoregressive Generation Framework for Various NER Tasks", + "abstract": "Recently, the recognition of flat, nested, and discontinuous entities by a unified generative model framework has received increasing attention both in the research field and industry. However, the current generative NER methods force the entities to be generated in a predefined order, suffering from error propagation and inefficient decoding. In this work, we propose a unified non-autoregressive generation (NAG) framework for general NER tasks, referred to as NAG-NER. First, we propose to generate entities as a set instead of a sequence, avoiding error propagation. Second, we propose incorporating NAG in NER tasks for efficient decoding by treating each entity as a target sequence. Third, to enhance the generation performances of the NAG decoder, we employ the NAG encoder to detect potential entity mentions. Extensive experiments show that our NAG-NER model outperforms the state-of-the-art generative NER models on three benchmark NER datasets of different types and two of our proprietary NER tasks.\\footnote{Code will be publicly available to the research community upon acceptance.}", + "authors": [ + "Xinpeng Zhang", + "Ming Tan", + "Jingfan Zhang", + "Wei Zhu" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.65", + "point2d": [ + 35.95410919189453, + -86.74127197265625 + ], + "cluster": 14.0 + }, + { + "idx": 1233, + "title": "Search Query Spell Correction with Weak Supervision in E-commerce", + "abstract": "Misspelled search queries in e-commerce can lead to empty or irrelevant products. Besides inadvertent typing mistakes, most spell mistakes occur because the user does not know the correct spelling, hence typing it as it is pronounced colloquially. This colloquial typing creates countless misspelling patterns for a single correct query. In this paper, we first systematically analyze and group different spell errors into error classes and then leverage the state-of-the-art Transformer model for contextual spell correction. We overcome the constraint of limited human labelled data by proposing novel synthetic data generation techniques for voluminous generation of training pairs needed by data hungry Transformers, without any human intervention. We further utilize weakly supervised data coupled with curriculum learning strategies to improve on tough spell mistakes without regressing on the easier ones. 
We show significant improvements from our model on human-labeled data and in online A/B experiments against multiple state-of-the-art models.", + "authors": [ + "Vishal Kakkar", + "Chinmay Sharma", + "Madhura Pande", + "Surender Kumar" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.66", + "point2d": [ + -7.278964042663574, + -13.134726524353027 + ], + "cluster": 4.0 + }, + { + "idx": 1234, + "title": "\u201cLet\u2019s not Quote out of Context\u201d: Unified Vision-Language Pretraining for Context Assisted Image Captioning", + "abstract": "Well-formed, context-aware image captions and tags in enterprise content such as marketing material are critical to ensure their brand presence and content recall. Manually creating and updating them is non-trivial given the scale and the tedium of the task. We propose a new unified Vision-Language (VL) model based on the One For All (OFA) model, with a focus on context-assisted image captioning where the caption is generated based on both the image and its context. Our approach aims to overcome the context-independent (image and text are treated independently) nature of the existing approaches. We exploit context by pretraining our model with datasets of three tasks: news image captioning where the news article is the context, contextual visual entailment, and keyword extraction from the context. The second pretraining task is a new VL task, and we construct and release two datasets for the task with 1.1M and 2.2K data instances. Our system achieves state-of-the-art results with an improvement of up to 8.34 CIDEr score on the benchmark news image captioning datasets. To the best of our knowledge, ours is the first effort at incorporating contextual information in pretraining the models for the VL tasks.", + "authors": [ + "Abisek Rajakumar Kalarani", + "Pushpak Bhattacharyya", + "Niyati Chhaya", + "Sumit Shekhar" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.67", + "point2d": [ + -57.939090728759766, + 44.575157165527344 + ], + "cluster": 43.0 + }, + { + "idx": 1235, + "title": "What, When, and How to Ground: Designing User Persona-Aware Conversational Agents for Engaging Dialogue", + "abstract": "This paper presents a method for building a personalized open-domain dialogue system to address the WWH (WHAT, WHEN, and HOW) problem for natural response generation in a commercial setting, where personalized dialogue responses are heavily interleaved with casual response turns. The proposed approach involves weighted dataset blending, negative persona information augmentation methods, and the design of personalized conversation datasets to address the challenges of WWH in personalized, open-domain dialogue systems. Our work effectively balances dialogue fluency and tendency to ground, while also introducing a response-type label to improve the controllability and explainability of the grounded responses. 
The combination of these methods leads to more fluent conversations, as evidenced by subjective human evaluations as well as objective evaluations.", + "authors": [ + "Deuksin Kwon", + "Sunwoo Lee", + "Ki Hyun Kim", + "Seojin Lee", + "Taeyoon Kim", + "Eric Davis" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.68", + "point2d": [ + 14.597138404846191, + 64.79898071289062 + ], + "cluster": 49.0 + }, + { + "idx": 1236, + "title": "CUPID: Curriculum Learning Based Real-Time Prediction using Distillation", + "abstract": "Relevance in E-commerce Product Search is crucial for providing customers with accurate results that match their query intent. With recent advancements in NLP and Deep Learning, Transformers have become the default choice for relevance classification tasks. In such a setting, the relevance model uses query text and product title as input features, and estimates if the product is relevant for the customer query. While cross-attention in Transformers enables a more accurate relevance prediction in such a setting, its high evaluation latency makes it unsuitable for real-time predictions in which thousands of products must be evaluated against a user query within a few milliseconds. To address this issue, we propose CUPID: a Curriculum learning based real-time Prediction using Distillation that utilizes knowledge distillation within a curriculum learning setting to learn a simpler architecture that can be evaluated within low latency budgets. In a bi-lingual relevance prediction task, our approach shows a 302 bps improvement on English and a 676 bps improvement for low-resource Arabic, while maintaining the low evaluation latency on CPUs.", + "authors": [ + "Arindam Bhattacharya", + "Ankith Ms", + "Ankit Gandhi", + "Vijay Huddar", + "Atul Saroop", + "Rahul Bhagat" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.69", + "point2d": [ + 9.803693771362305, + -9.07201099395752 + ], + "cluster": 18.0 + }, + { + "idx": 1237, + "title": "Answering Unanswered Questions through Semantic Reformulations in Spoken QA", + "abstract": "Spoken Question Answering (QA) is a key feature of voice assistants, usually backed by multiple QA systems. Users ask questions via spontaneous speech that can contain disfluencies, errors, and informal syntax or phrasing. This is a major challenge in QA, causing unanswered questions or irrelevant answers, leading to bad user experiences. We analyze failed QA requests to identify core challenges: lexical gaps, proposition types, complex syntactic structure, and high specificity. We propose a Semantic Question Reformulation (SURF) model offering three linguistically-grounded operations (repair, syntactic reshaping, generalization) to rewrite questions to facilitate answering. Offline evaluation on 1M unanswered questions from a leading voice assistant shows that SURF significantly improves answer rates: up to 24% of previously unanswered questions obtain relevant answers (75%). 
Live deployment shows a positive impact for millions of customers with unanswered questions; explicit relevance feedback shows high user satisfaction.", + "authors": [ + "Pedro Faustini", + "Zhiyu Chen", + "Besnik Fetahu", + "Oleg Rokhlenko", + "Shervin Malmasi" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.70", + "point2d": [ + 66.74422454833984, + 12.524670600891113 + ], + "cluster": 5.0 + }, + { + "idx": 1238, + "title": "Exploring Zero and Few-shot Techniques for Intent Classification", + "abstract": "Conversational NLU providers often need to scale to thousands of intent-classification models where new customers often face the cold-start problem. Scaling to so many customers puts a constraint on storage space as well. In this paper, we explore four different zero and few-shot intent classification approaches with this low-resource constraint: 1) domain adaptation, 2) data augmentation, 3) zero-shot intent classification using intent descriptions with large language models (LLMs), and 4) parameter-efficient fine-tuning of instruction-finetuned language models. Our results show that all these approaches are effective to different degrees in low-resource settings. Parameter-efficient fine-tuning using the T-Few recipe on Flan-T5 yields the best performance even with just one sample per intent. We also show that the zero-shot method of prompting LLMs using intent descriptions is very competitive.", + "authors": [ + "Soham Parikh", + "Mitul Tiwari", + "Prashil Tumbade", + "Quaizar Vohra" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.71", + "point2d": [ + -13.213430404663086, + -7.726759910583496 + ], + "cluster": 20.0 + }, + { + "idx": 1239, + "title": "Referring to Screen Texts with Voice Assistants", + "abstract": "Voice assistants help users make phone calls, send messages, create events, navigate, and do a lot more. However, assistants have limited capacity to understand their users\u2019 context. In this work, we aim to take a step in this direction. Our work dives into a new experience for users to refer to phone numbers, addresses, email addresses, URLs, and dates on their phone screens. We focus on reference understanding, which is particularly interesting when, similar to visual grounding, there are multiple similar texts on screen. We collect a dataset and propose a lightweight general-purpose model for this novel experience. Since consuming pixels directly is expensive, our system is designed to rely only on text extracted from the UI. Our model is modular, offering flexibility, better interpretability, and efficient runtime memory usage.", + "authors": [ + "Shruti Bhargava", + "Anand Dhoot", + "Ing-marie Jonsson", + "Hoang Long Nguyen", + "Alkesh Patel", + "Hong Yu", + "Vincent Renkens" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.72", + "point2d": [ + -20.354063034057617, + 37.89155578613281 + ], + "cluster": 29.0 + }, + { + "idx": 1240, + "title": "Generate-then-Retrieve: Intent-Aware FAQ Retrieval in Product Search", + "abstract": "Frequently Asked Question (FAQ) retrieval aims at retrieving question-answer pairs for a given user query. Integrating FAQ retrieval with product search can not only empower users to make more informed purchase decisions, but also enhance user retention through efficient post-purchase support. 
Providing FAQ content without disrupting the user\u2019s shopping experience poses challenges in deciding when and how to show FAQ results. Our proposed intent-aware FAQ retrieval consists of (1) an intent classifier that predicts whether the query is looking for an FAQ; (2) a reformulation model that rewrites the query into a natural question. Offline evaluation demonstrates that our approach improves Hit@1 by 12% on retrieving ground-truth FAQs, while reducing latency by 95% compared to baseline systems. These improvements are further validated by real user feedback, where more than 99% of users consider FAQs displayed on top of product search results helpful. Overall, our findings show promising directions for integrating FAQ retrieval into product search at scale.", + "authors": [ + "Zhiyu Chen", + "Jason Choi", + "Besnik Fetahu", + "Oleg Rokhlenko", + "Shervin Malmasi" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.73", + "point2d": [ + 65.84791564941406, + 21.770044326782227 + ], + "cluster": 18.0 + }, + { + "idx": 1241, + "title": "KAFA: Rethinking Image Ad Understanding with Knowledge-Augmented Feature Adaptation of Vision-Language Models", + "abstract": "Image ad understanding is a crucial task with wide real-world applications. Although highly challenging with the involvement of diverse atypical scenes, real-world entities, and reasoning over scene-texts, how to interpret image ads is relatively under-explored, especially in the era of foundational vision-language models (VLMs) featuring impressive generalizability and adaptability. In this paper, we perform the first empirical study of image ad understanding through the lens of pre-trained VLMs. We benchmark and reveal practical challenges in adapting these VLMs to image ad understanding. We propose a simple feature adaptation strategy to effectively fuse multimodal information for image ads and further empower it with knowledge of real-world entities. We hope our study draws more attention to image ad understanding, which is broadly relevant to the advertising industry.", + "authors": [ + "Zhiwei Jia", + "Pradyumna Narayana", + "Arjun Akula", + "Garima Pruthi", + "Hao Su", + "Sugato Basu", + "Varun Jampani" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.74", + "point2d": [ + -57.981693267822266, + 43.225425720214844 + ], + "cluster": 43.0 + }, + { + "idx": 1242, + "title": "Weakly supervised hierarchical multi-task classification of customer questions", + "abstract": "Identifying granular and actionable topics from customer questions (CQ) posted on e-commerce websites helps surface the missing information expected by customers on the product detail page (DP), provides insights to brands and sellers on what critical product information customers are looking for before making a purchase decision, and helps enrich the catalog quality to improve the overall customer experience (CX). We propose a weakly supervised Hierarchical Multi-task Classification Framework (HMCF) to identify topics from customer questions at various granularities. Complexity lies in creating a list of granular topics (taxonomy) for 1000s of product categories and building a scalable classification system. To this end, we introduce a clustering-based Taxonomy Creation and Data Labeling (TCDL) module for creating taxonomy and labelled data with minimal supervision. 
Using the TCDL module, the taxonomy and labelled data creation task is reduced to 2 hours, as compared to 2 weeks of manual effort by a subject matter expert. For classification, we propose a two-level HMCF that performs multi-class classification to identify the coarse level-1 topic and leverages an NLI-based label-aware approach to identify the granular level-2 topic. We showcase that HMCF (based on BERT and NLI) a) achieves an absolute improvement of 13% in Top-1 accuracy over single-task non-hierarchical baselines, b) learns a generic domain-invariant function that can adapt to a constantly evolving taxonomy (open label set) without the need for re-training, and c) reduces model deployment efforts significantly, since it needs only one model that caters to 1000s of product categories.", + "authors": [ + "Jitenkumar Rana", + "Promod Yenigalla", + "Chetan Aggarwal", + "Sandeep Sricharan Mukku", + "Manan Soni", + "Rashmi Patange" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.75", + "point2d": [ + 22.981061935424805, + -60.887306213378906 + ], + "cluster": 17.0 + }, + { + "idx": 1243, + "title": "Automated Digitization of Unstructured Medical Prescriptions", + "abstract": "Automated digitization of prescription images is a critical prerequisite to scale digital healthcare services such as online pharmacies. This is challenging in emerging markets since prescriptions are not digitized at source and patients lack the medical expertise to interpret prescriptions to place orders. In this paper, we present a prescription digitization system for online medicine ordering built with minimal supervision. Our system uses a modular pipeline comprising a mix of ML and rule-based components for (a) image to text extraction, (b) segmentation into blocks and medication items, (c) medication attribute extraction, (d) matching against a medicine catalog, and (e) shopping cart building. Our approach efficiently utilizes multiple signals like layout, medical ontologies, and semantic embeddings via the LayoutLMv2 model to yield substantial improvement relative to strong baselines on medication attribute extraction. Our pipeline achieves a +5.9% gain in precision@3 and +5.6% in recall@3 over a catalog-based fuzzy matching baseline for shopping cart building for printed prescriptions.", + "authors": [ + "Megha Sharma", + "Tushar Vatsal", + "Srujana Merugu", + "Aruna Rajan" + ], + "year": 2023, + "source": "acl", + "publication_type": "industry", + "doi": "10.18653/v1/2023.acl-industry.76", + "point2d": [ + 30.07203483581543, + -43.19080352783203 + ], + "cluster": 42.0 + }, + { + "idx": 1244, + "title": "Goal Awareness for Conversational AI: Proactivity, Non-collaborativity, and Beyond", + "abstract": "Conversational systems are envisioned to provide social support or functional service to human users via natural language interactions. Conventional conversation research mainly focuses on the response ability of the system, such as dialogue context understanding and response generation, but overlooks the design of an essential property of intelligent conversations, i.e., goal awareness. The awareness of goals means the state of not only being responsive to the users but also being aware of the target conversational goal and capable of leading the conversation towards the goal, which is a significant step towards higher-level intelligence and artificial consciousness.
It can not only largely improve user engagement and service efficiency in the conversation, but also empower the system to handle more complicated conversation tasks that involve strategic and motivational interactions. In this tutorial, we will introduce the recent advances in the design of an agent\u2019s awareness of goals in a wide range of conversational systems.", + "authors": [ + "Yang Deng", + "Wenqiang Lei", + "Minlie Huang", + "Tat-Seng Chua" + ], + "year": 2023, + "source": "acl", + "publication_type": "tutorials", + "doi": "10.18653/v1/2023.acl-tutorials.1", + "point2d": [ + 22.777347564697266, + 65.49446868896484 + ], + "cluster": 33.0 + }, + { + "idx": 1245, + "title": "Complex Reasoning in Natural Language", + "abstract": "Teaching machines to reason over texts has been a long-standing goal of natural language processing (NLP). To this end, researchers have designed a diverse set of complex reasoning tasks that involve compositional reasoning, knowledge retrieval, grounding, commonsense reasoning, etc. A standard choice for building systems that perform a desired type of reasoning is to fine-tune a pretrained language model (LM) on specific downstream tasks. However, recent research has demonstrated that such a straightforward approach is often brittle. For example, Elazar et al. (2021) and Branco et al. (2021) show that, on question-answering (QA) tasks, similar performance can be achieved with questions removed from the inputs. Min et al. (2019), Chen and Durrett (2019), and Tang et al. (2021) show that models trained on multi-hop QA do not generalize to answer single-hop questions. The reasoning capabilities of these models thus remain at a surface level, i.e., exploiting data patterns. Consequently, augmenting LMs with techniques that make them robust and effective has become an active research area. We will start the tutorial by providing an overview of complex reasoning tasks where the standard application of pretrained language models fails. This tutorial then reviews recent promising directions for tackling these tasks. Specifically, we focus on the following groups of approaches that explicitly consider problem structures: (1) knowledge-augmented methods, where the knowledge is either incorporated during fine-tuning or pretraining; (2) few-shot prompting methods, which effectively guide the models to follow instructions; (3) neuro-symbolic methods, which produce explicit intermediate representations; and (4) rationale-based methods, one of the most popular forms of neuro-symbolic methods, which highlight subsets of input as explanations for individual model predictions.", + "authors": [ + "Wenting Zhao", + "Mor Geva", + "Bill Yuchen Lin", + "Michihiro Yasunaga", + "Aman Madaan", + "Tao Yu" + ], + "year": 2023, + "source": "acl", + "publication_type": "tutorials", + "doi": "10.18653/v1/2023.acl-tutorials.2", + "point2d": [ + 47.18156433105469, + -13.447463989257812 + ], + "cluster": 36.0 + }, + { + "idx": 1246, + "title": "Everything you need to know about Multilingual LLMs: Towards fair, performant and reliable models for languages of the world", + "abstract": "This tutorial will cover various aspects of scaling up language technologies to many of the world\u2019s languages by describing the latest research in Massively Multilingual Language Models (MMLMs).
We will cover topics such as data collection, training and fine-tuning of models, Responsible AI issues such as fairness, bias, and toxicity, as well as linguistic diversity and evaluation in the context of MMLMs, specifically focusing on issues in non-English and low-resource languages. Further, we will discuss some of the real-world challenges in deploying these models in language communities in the field. With the performance of MMLMs improving in the zero-shot setting for many languages, it is now becoming feasible to use them for building language technologies in many languages of the world, and this tutorial will provide the computational linguistics community with unique insights from the latest research in multilingual models.", + "authors": [ + "Sunayana Sitaram", + "Monojit Choudhury", + "Barun Patra", + "Vishrav Chaudhary", + "Kabir Ahuja", + "Kalika Bali" + ], + "year": 2023, + "source": "acl", + "publication_type": "tutorials", + "doi": "10.18653/v1/2023.acl-tutorials.3", + "point2d": [ + -52.81568908691406, + -10.14548110961914 + ], + "cluster": 46.0 + }, + { + "idx": 1247, + "title": "Generating Text from Language Models", + "abstract": "An increasingly large percentage of natural language processing (NLP) tasks center around the generation of text from probabilistic language models. Despite this trend, techniques for improving or specifying preferences in these generated texts rely mostly on intuition-based heuristics. Further, the literature lacks a unified presentation of their motivations, practical implementations, successes, and pitfalls. Practitioners must, therefore, choose somewhat blindly between generation algorithms\u2014like top-p sampling or beam search\u2014which can lead to wildly different results. At the same time, language generation research continues to criticize and improve the standard toolboxes, further adding entropy to the state of the field. In this tutorial, we will provide a centralized and cohesive discussion of critical considerations when choosing how to generate from a language model. We will cover a wide range of empirically-observed problems (like degradation, hallucination, repetition) and their corresponding proposed algorithmic solutions from recent research (like top-p sampling and its successors). We will then discuss a subset of these algorithms under a unified light; most stochastic generation strategies can be framed as locally adapting the probabilities of a model to avoid failure cases. Finally, we will cover methods in controlled generation that go beyond just ensuring coherence to ensure that text exhibits specific desired properties. We aim for NLP practitioners and researchers to leave our tutorial with a unified framework which they can use to evaluate and contribute to the latest research in language generation.", + "authors": [ + "Afra Amini", + "Ryan Cotterell", + "John Hewitt", + "Clara Meister", + "Tiago Pimentel" + ], + "year": 2023, + "source": "acl", + "publication_type": "tutorials", + "doi": "10.18653/v1/2023.acl-tutorials.4", + "point2d": [ + -26.916095733642578, + 15.548568725585938 + ], + "cluster": 4.0 + }, + { + "idx": 1248, + "title": "Indirectly Supervised Natural Language Processing", + "abstract": "This tutorial targets researchers and practitioners who are interested in ML technologies for NLP that learn from indirect supervision.
In particular, we will present a diverse thread of indirect supervision studies that try to answer the following questions: (i) when and how can we provide supervision for a target task T, if all we have is data that corresponds to a \u201crelated\u201d task T\u2032? (ii) humans do not use exhaustive supervision; they rely on occasional feedback, and learn from incidental signals from various sources; how can we effectively incorporate such supervision in machine learning? (iii) how can we leverage multi-modal supervision to help NLP? To this end, we will discuss several lines of research that address those challenges, including (i) indirect supervision from T\u2032 that handles T with outputs spanning from a moderate size to an open space, (ii) the use of sparsely occurring and incidental signals, such as partial labels, noisy labels, knowledge-based constraints, and cross-domain or cross-task annotations\u2014all having statistical associations with the task, (iii) principled ways to measure and understand why these incidental signals can contribute to our target tasks, and (iv) indirect supervision from vision-language signals. We will conclude the tutorial by outlining directions for further investigation.", + "authors": [ + "Wenpeng Yin", + "Muhao Chen", + "Ben Zhou", + "Qiang Ning", + "Kai-Wei Chang", + "Dan Roth" + ], + "year": 2023, + "source": "acl", + "publication_type": "tutorials", + "doi": "10.18653/v1/2023.acl-tutorials.5", + "point2d": [ + -10.183514595031738, + -10.186142921447754 + ], + "cluster": 3.0 + }, + { + "idx": 1249, + "title": "Retrieval-based Language Models and Applications", + "abstract": "Retrieval-based language models (LMs) have shown impressive performance on diverse NLP tasks. In this tutorial, we will provide a comprehensive and coherent overview of recent advances in retrieval-based LMs. We will start by providing preliminaries covering the foundations of LMs (e.g., masked LMs, autoregressive LMs) and retrieval systems (e.g., nearest-neighbor search). We will then detail recent progress in retrieval-based models, focusing on their model architectures and learning approaches. Next, we will show how retrieval-based LMs are adapted to downstream applications and extended to multilingual and multi-modal settings. Finally, we will use an exercise to showcase the effectiveness of retrieval-based LMs.", + "authors": [ + "Akari Asai", + "Sewon Min", + "Zexuan Zhong", + "Danqi Chen" + ], + "year": 2023, + "source": "acl", + "publication_type": "tutorials", + "doi": "10.18653/v1/2023.acl-tutorials.6", + "point2d": [ + 11.165570259094238, + -17.568254470825195 + ], + "cluster": 20.0 + } +] \ No newline at end of file