Init
- .gitignore +3 -0
- __pycache__/gpt_text.cpython-38.pyc +0 -0
- app.py +34 -0
- gpt_text.py +535 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
+env/
+output.pdf
+paper.pdf
__pycache__/gpt_text.cpython-38.pyc
ADDED
Binary file (40.9 kB)
app.py
ADDED
@@ -0,0 +1,34 @@
+from gpt_text import init_val
+
+import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud, STOPWORDS
+
+
+# Read text from a text area and plot a word cloud in a Streamlit app
+
+st.title("WordCloud Generator")
+st.markdown("People who are new to a field can use this to get a quick overview of it. It can also be used to get a quick summary of the keywords in a research paper, article, or book.")
+st.markdown("The WordCloud generator creates an image of the words in the text you enter. The size of each word reflects its importance in the text.")
+
+st.markdown("Hugging Face is a free platform for sharing apps like this, which helps developers like us reach more people.")
+
+st.subheader("Play with the WordCloud Generator")
+st.markdown("_The example given below is based on the GPT-1 research paper [1]_")
+text = st.text_area("Enter text from the article", placeholder="Type Here ..", height=250, value=init_val)
+if st.button("Generate WordCloud"):
+    if not text.strip():  # the placeholder is never returned as a value, so check for empty input
+        st.warning("Please enter some text")
+    else:
+        wordcloud = WordCloud(min_word_length=3, background_color='White').generate(text)
+        fig, ax = plt.subplots()
+        ax.imshow(wordcloud, interpolation='bilinear')
+        ax.axis("off")
+        st.pyplot(fig)
+
+st.subheader("Reference")
+st.markdown("1. [Radford, A., Narasimhan, K., Salimans, T., & Sutskever, I. (n.d.). (GPT-1) Improving Language Understanding by Generative Pre-Training.](https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf)")
+st.markdown("2. [Wordcloud library](https://pypi.org/project/wordcloud/)")
+
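The app can be run locally with streamlit run app.py once streamlit, wordcloud, and matplotlib are installed. As a reading aid, here is a minimal standalone sketch of the same word-cloud generation outside Streamlit; the stopwords and max_words settings are illustrative additions (app.py imports STOPWORDS but does not pass it) and are not part of this commit:

# Standalone sketch: build the same word cloud from the bundled GPT-1 text without Streamlit.
# Assumes the wordcloud and matplotlib packages are installed and gpt_text.py is importable.
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

from gpt_text import init_val

wc = WordCloud(
    min_word_length=3,        # same setting as app.py
    background_color="White",
    stopwords=STOPWORDS,      # illustrative: drop common English stopwords
    max_words=200,            # illustrative: cap the number of words drawn
).generate(init_val)

plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.savefig("wordcloud.png", bbox_inches="tight")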
gpt_text.py
ADDED
@@ -0,0 +1,535 @@
+init_val = """
+Improving Language Understanding
+by Generative Pre-Training
+Alec Radford
+OpenAI
+alec@openai.com
+Karthik Narasimhan
+OpenAI
+karthikn@openai.com
+Tim Salimans
+OpenAI
+tim@openai.com
+Ilya Sutskever
+OpenAI
+ilyasu@openai.com
+Abstract
+Natural language understanding comprises a wide range of diverse tasks such
+as textual entailment, question answering, semantic similarity assessment, and
+document classification. Although large unlabeled text corpora are abundant,
+labeled data for learning these specific tasks is scarce, making it challenging for
+discriminatively trained models to perform adequately. We demonstrate that large
+gains on these tasks can be realized by generative pre-training of a language model
+on a diverse corpus of unlabeled text, followed by discriminative fine-tuning on each
+specific task. In contrast to previous approaches, we make use of task-aware input
+transformations during fine-tuning to achieve effective transfer while requiring
+minimal changes to the model architecture. We demonstrate the effectiveness of
+our approach on a wide range of benchmarks for natural language understanding.
+Our general task-agnostic model outperforms discriminatively trained models that
+use architectures specifically crafted for each task, significantly improving upon the
+state of the art in 9 out of the 12 tasks studied. For instance, we achieve absolute
+improvements of 8.9% on commonsense reasoning (Stories Cloze Test), 5.7% on
+question answering (RACE), and 1.5% on textual entailment (MultiNLI).
+1 Introduction
+The ability to learn effectively from raw text is crucial to alleviating the dependence on supervised
+learning in natural language processing (NLP). Most deep learning methods require substantial
+amounts of manually labeled data, which restricts their applicability in many domains that suffer
+from a dearth of annotated resources [61]. In these situations, models that can leverage linguistic
+information from unlabeled data provide a valuable alternative to gathering more annotation, which
+can be time-consuming and expensive. Further, even in cases where considerable supervision
+is available, learning good representations in an unsupervised fashion can provide a significant
+performance boost. The most compelling evidence for this so far has been the extensive use of pre-trained word embeddings [10, 39, 42] to improve performance on a range of NLP tasks [8, 11, 26, 45].
+Leveraging more than word-level information from unlabeled text, however, is challenging for two
+main reasons. First, it is unclear what type of optimization objectives are most effective at learning
+text representations that are useful for transfer. Recent research has looked at various objectives
+such as language modeling [44], machine translation [38], and discourse coherence [22], with each
+method outperforming the others on different tasks.¹ Second, there is no consensus on the most
+effective way to transfer these learned representations to the target task. Existing techniques involve
+a combination of making task-specific changes to the model architecture [43, 44], using intricate
+learning schemes [21] and adding auxiliary learning objectives [50]. These uncertainties have made
+it difficult to develop effective semi-supervised learning approaches for language processing.
+¹ https://gluebenchmark.com/leaderboard
+Preprint. Work in progress.
+In this paper, we explore a semi-supervised approach for language understanding tasks using a
+combination of unsupervised pre-training and supervised fine-tuning. Our goal is to learn a universal
+representation that transfers with little adaptation to a wide range of tasks. We assume access to
+a large corpus of unlabeled text and several datasets with manually annotated training examples
+(target tasks). Our setup does not require these target tasks to be in the same domain as the unlabeled
+corpus. We employ a two-stage training procedure. First, we use a language modeling objective on
+the unlabeled data to learn the initial parameters of a neural network model. Subsequently, we adapt
+these parameters to a target task using the corresponding supervised objective.
+For our model architecture, we use the Transformer [62], which has been shown to perform strongly on
+various tasks such as machine translation [62], document generation [34], and syntactic parsing [29].
+This model choice provides us with a more structured memory for handling long-term dependencies in
+text, compared to alternatives like recurrent networks, resulting in robust transfer performance across
+diverse tasks. During transfer, we utilize task-specific input adaptations derived from traversal-style
+approaches [52], which process structured text input as a single contiguous sequence of tokens. As
+we demonstrate in our experiments, these adaptations enable us to fine-tune effectively with minimal
+changes to the architecture of the pre-trained model.
+We evaluate our approach on four types of language understanding tasks – natural language inference,
+question answering, semantic similarity, and text classification. Our general task-agnostic model
+outperforms discriminatively trained models that employ architectures specifically crafted for each
+task, significantly improving upon the state of the art in 9 out of the 12 tasks studied. For instance,
+we achieve absolute improvements of 8.9% on commonsense reasoning (Stories Cloze Test) [40],
+5.7% on question answering (RACE) [30], 1.5% on textual entailment (MultiNLI) [66] and 5.5% on
+the recently introduced GLUE multi-task benchmark [64]. We also analyzed zero-shot behaviors
+of the pre-trained model on four different settings and demonstrate that it acquires useful linguistic
+knowledge for downstream tasks.
+2 Related Work
+Semi-supervised learning for NLP Our work broadly falls under the category of semi-supervised
+learning for natural language. This paradigm has attracted significant interest, with applications to
+tasks like sequence labeling [24, 33, 57] or text classification [41, 70]. The earliest approaches used
+unlabeled data to compute word-level or phrase-level statistics, which were then used as features in a
+supervised model [33]. Over the last few years, researchers have demonstrated the benefits of using
+word embeddings [11, 39, 42], which are trained on unlabeled corpora, to improve performance on a
+variety of tasks [8, 11, 26, 45]. These approaches, however, mainly transfer word-level information,
+whereas we aim to capture higher-level semantics.
+Recent approaches have investigated learning and utilizing more than word-level semantics from
+unlabeled data. Phrase-level or sentence-level embeddings, which can be trained using an unlabeled
+corpus, have been used to encode text into suitable vector representations for various target tasks [28,
+32, 1, 36, 22, 12, 56, 31].
+Unsupervised pre-training Unsupervised pre-training is a special case of semi-supervised learning
+where the goal is to find a good initialization point instead of modifying the supervised learning
+objective. Early works explored the use of the technique in image classification [20, 49, 63] and
+regression tasks [3]. Subsequent research [15] demonstrated that pre-training acts as a regularization
+scheme, enabling better generalization in deep neural networks. In recent work, the method has
+been used to help train deep neural networks on various tasks like image classification [69], speech
+recognition [68], entity disambiguation [17] and machine translation [48].
+The closest line of work to ours involves pre-training a neural network using a language modeling
+objective and then fine-tuning it on a target task with supervision. Dai et al. [13] and Howard and
+Ruder [21] follow this method to improve text classification. However, although the pre-training
+phase helps capture some linguistic information, their usage of LSTM models restricts their prediction
+ability to a short range. In contrast, our choice of transformer networks allows us to capture longer-range linguistic structure, as demonstrated in our experiments. Further, we also demonstrate the
+effectiveness of our model on a wider range of tasks including natural language inference, paraphrase
+detection and story completion. Other approaches [43, 44, 38] use hidden representations from a
+pre-trained language or machine translation model as auxiliary features while training a supervised
+model on the target task. This involves a substantial amount of new parameters for each separate
+target task, whereas we require minimal changes to our model architecture during transfer.
+Auxiliary training objectives Adding auxiliary unsupervised training objectives is an alternative
+form of semi-supervised learning. Early work by Collobert and Weston [10] used a wide variety of
+auxiliary NLP tasks such as POS tagging, chunking, named entity recognition, and language modeling
+to improve semantic role labeling. More recently, Rei [50] added an auxiliary language modeling
+objective to their target task objective and demonstrated performance gains on sequence labeling
+tasks. Our experiments also use an auxiliary objective, but as we show, unsupervised pre-training
+already learns several linguistic aspects relevant to target tasks.
+3 Framework
+Our training procedure consists of two stages. The first stage is learning a high-capacity language
+model on a large corpus of text. This is followed by a fine-tuning stage, where we adapt the model to
+a discriminative task with labeled data.
+3.1 Unsupervised pre-training
+Given an unsupervised corpus of tokens U = {u1, . . . , un}, we use a standard language modeling
+objective to maximize the following likelihood:
+L1(U) = Σi log P(ui | ui−k, . . . , ui−1; Θ)   (1)
+where k is the size of the context window, and the conditional probability P is modeled using a neural
+network with parameters Θ. These parameters are trained using stochastic gradient descent [51].
+In our experiments, we use a multi-layer Transformer decoder [34] for the language model, which is
+a variant of the transformer [62]. This model applies a multi-headed self-attention operation over the
+input context tokens followed by position-wise feedforward layers to produce an output distribution
+over target tokens:
+h0 = U We + Wp
+hl = transformer_block(hl−1) ∀i ∈ [1, n]
+P(u) = softmax(hn We^T)   (2)
+where U = (u−k, . . . , u−1) is the context vector of tokens, n is the number of layers, We is the token
+embedding matrix, and Wp is the position embedding matrix.
+3.2 Supervised fine-tuning
+After training the model with the objective in Eq. 1, we adapt the parameters to the supervised target
+task. We assume a labeled dataset C, where each instance consists of a sequence of input tokens,
+x^1, . . . , x^m, along with a label y. The inputs are passed through our pre-trained model to obtain
+the final transformer block's activation hl^m, which is then fed into an added linear output layer with
+parameters Wy to predict y:
+P(y | x^1, . . . , x^m) = softmax(hl^m Wy).   (3)
+This gives us the following objective to maximize:
+L2(C) = Σ(x,y) log P(y | x^1, . . . , x^m).   (4)
+We additionally found that including language modeling as an auxiliary objective to the fine-tuning
+helped learning by (a) improving generalization of the supervised model, and (b) accelerating
+convergence. This is in line with prior work [50, 43], who also observed improved performance with
+such an auxiliary objective. Specifically, we optimize the following objective (with weight λ):
+L3(C) = L2(C) + λ ∗ L1(C)   (5)
+Overall, the only extra parameters we require during fine-tuning are Wy, and embeddings for delimiter
+tokens (described below in Section 3.3).
+Figure 1: (left) Transformer architecture and training objectives used in this work. (right) Input
+transformations for fine-tuning on different tasks. We convert all structured inputs into token
+sequences to be processed by our pre-trained model, followed by a linear+softmax layer.
+3.3 Task-specific input transformations
+For some tasks, like text classification, we can directly fine-tune our model as described above.
+Certain other tasks, like question answering or textual entailment, have structured inputs such as
+ordered sentence pairs, or triplets of document, question, and answers. Since our pre-trained model
+was trained on contiguous sequences of text, we require some modifications to apply it to these tasks.
+Previous work proposed learning task specific architectures on top of transferred representations [44].
+Such an approach re-introduces a significant amount of task-specific customization and does not
+use transfer learning for these additional architectural components. Instead, we use a traversal-style
+approach [52], where we convert structured inputs into an ordered sequence that our pre-trained
+model can process. These input transformations allow us to avoid making extensive changes to the
+architecture across tasks. We provide a brief description of these input transformations below and
+Figure 1 provides a visual illustration. All transformations include adding randomly initialized start
+and end tokens (⟨s⟩, ⟨e⟩).
+Textual entailment For entailment tasks, we concatenate the premise p and hypothesis h token
+sequences, with a delimiter token ($) in between.
+Similarity For similarity tasks, there is no inherent ordering of the two sentences being compared.
+To reflect this, we modify the input sequence to contain both possible sentence orderings (with a
+delimiter in between) and process each independently to produce two sequence representations hl^m,
+which are added element-wise before being fed into the linear output layer.
+Question Answering and Commonsense Reasoning For these tasks, we are given a context
+document z, a question q, and a set of possible answers {ak}. We concatenate the document context
+and question with each possible answer, adding a delimiter token in between to get [z; q; $; ak]. Each
+of these sequences are processed independently with our model and then normalized via a softmax
+layer to produce an output distribution over possible answers.
+4 Experiments
+4.1 Setup
+Unsupervised pre-training We use the BooksCorpus dataset [71] for training the language model.
+It contains over 7,000 unique unpublished books from a variety of genres including Adventure,
+Fantasy, and Romance. Crucially, it contains long stretches of contiguous text, which allows the
+generative model to learn to condition on long-range information. An alternative dataset, the 1B
+Word Benchmark, which is used by a similar approach, ELMo [44], is approximately the same size
+but is shuffled at a sentence level - destroying long-range structure. Our language model achieves a
+very low token level perplexity of 18.4 on this corpus.
+Table 1: A list of the different tasks and datasets used in our experiments.
+Task Datasets
+Natural language inference SNLI [5], MultiNLI [66], Question NLI [64], RTE [4], SciTail [25]
+Question Answering RACE [30], Story Cloze [40]
+Sentence similarity MSR Paraphrase Corpus [14], Quora Question Pairs [9], STS Benchmark [6]
+Classification Stanford Sentiment Treebank-2 [54], CoLA [65]
+Model specifications Our model largely follows the original transformer work [62]. We trained a
+12-layer decoder-only transformer with masked self-attention heads (768 dimensional states and 12
+attention heads). For the position-wise feed-forward networks, we used 3072 dimensional inner states.
+We used the Adam optimization scheme [27] with a max learning rate of 2.5e-4. The learning rate
+was increased linearly from zero over the first 2000 updates and annealed to 0 using a cosine schedule.
+We train for 100 epochs on minibatches of 64 randomly sampled, contiguous sequences of 512 tokens.
+Since layernorm [2] is used extensively throughout the model, a simple weight initialization of
+N(0, 0.02) was sufficient. We used a bytepair encoding (BPE) vocabulary with 40,000 merges [53]
+and residual, embedding, and attention dropouts with a rate of 0.1 for regularization. We also
+employed a modified version of L2 regularization proposed in [37], with w = 0.01 on all non bias or
+gain weights. For the activation function, we used the Gaussian Error Linear Unit (GELU) [18]. We
+used learned position embeddings instead of the sinusoidal version proposed in the original work.
+We use the ftfy library² to clean the raw text in BooksCorpus, standardize some punctuation and
+whitespace, and use the spaCy tokenizer.³
+Fine-tuning details Unless specified, we reuse the hyperparameter settings from unsupervised
+pre-training. We add dropout to the classifier with a rate of 0.1. For most tasks, we use a learning rate
+of 6.25e-5 and a batchsize of 32. Our model finetunes quickly and 3 epochs of training was sufficient
+for most cases. We use a linear learning rate decay schedule with warmup over 0.2% of training. λ
+was set to 0.5.
+4.2 Supervised fine-tuning
+We perform experiments on a variety of supervised tasks including natural language inference,
+question answering, semantic similarity, and text classification. Some of these tasks are available
+as part of the recently released GLUE multi-task benchmark [64], which we make use of. Figure 1
+provides an overview of all the tasks and datasets.
+Natural Language Inference The task of natural language inference (NLI), also known as recognizing textual entailment, involves reading a pair of sentences and judging the relationship between
+them from one of entailment, contradiction or neutral. Although there has been a lot of
+recent interest [58, 35, 44], the task remains challenging due to the presence of a wide variety of
+phenomena like lexical entailment, coreference, and lexical and syntactic ambiguity. We evaluate
+on five datasets with diverse sources, including image captions (SNLI), transcribed speech, popular
+fiction, and government reports (MNLI), Wikipedia articles (QNLI), science exams (SciTail) or news
+articles (RTE).
+Table 2 details various results on the different NLI tasks for our model and previous state-of-the-art
+approaches. Our method significantly outperforms the baselines on four of the five datasets, achieving
+absolute improvements of up to 1.5% on MNLI, 5% on SciTail, 5.8% on QNLI and 0.6% on SNLI
+over the previous best results. This demonstrates our model's ability to better reason over multiple
+sentences, and handle aspects of linguistic ambiguity. On RTE, one of the smaller datasets we
+evaluate on (2490 examples), we achieve an accuracy of 56%, which is below the 61.7% reported by a
+multi-task biLSTM model. Given the strong performance of our approach on larger NLI datasets, it is
+likely our model will benefit from multi-task training as well but we have not explored this currently.
+² https://ftfy.readthedocs.io/en/latest/
+³ https://spacy.io/
+Table 2: Experimental results on natural language inference tasks, comparing our model with current
+state-of-the-art methods. 5x indicates an ensemble of 5 models. All datasets use accuracy as the
+evaluation metric.
+Method MNLI-m MNLI-mm SNLI SciTail QNLI RTE
+ESIM + ELMo [44] (5x) - - 89.3 - - -
+CAFE [58] (5x) 80.2 79.0 89.3 - - -
+Stochastic Answer Network [35] (3x) 80.6 80.1 - - - -
+CAFE [58] 78.7 77.9 88.5 83.3 - -
+GenSen [64] 71.4 71.3 - - 82.3 59.2
+Multi-task BiLSTM + Attn [64] 72.2 72.1 - - 82.1 61.7
+Finetuned Transformer LM (ours) 82.1 81.4 89.9 88.3 88.1 56.0
+Table 3: Results on question answering and commonsense reasoning, comparing our model with
+current state-of-the-art methods. 9x means an ensemble of 9 models.
+Method Story Cloze RACE-m RACE-h RACE
+val-LS-skip [55] 76.5 - - -
+Hidden Coherence Model [7] 77.6 - - -
+Dynamic Fusion Net [67] (9x) - 55.6 49.4 51.2
+BiAttention MRU [59] (9x) - 60.2 50.3 53.3
+Finetuned Transformer LM (ours) 86.5 62.9 57.4 59.0
+Question answering and commonsense reasoning Another task that requires aspects of single
+and multi-sentence reasoning is question answering. We use the recently released RACE dataset [30],
+consisting of English passages with associated questions from middle and high school exams. This
+corpus has been shown to contain more reasoning type questions than other datasets like CNN [19] or
+SQuAD [47], providing the perfect evaluation for our model which is trained to handle long-range
+contexts. In addition, we evaluate on the Story Cloze Test [40], which involves selecting the correct
+ending to multi-sentence stories from two options. On these tasks, our model again outperforms the
+previous best results by significant margins - up to 8.9% on Story Cloze, and 5.7% overall on RACE.
+This demonstrates the ability of our model to handle long-range contexts effectively.
+Semantic Similarity Semantic similarity (or paraphrase detection) tasks involve predicting whether
+two sentences are semantically equivalent or not. The challenges lie in recognizing rephrasing of
+concepts, understanding negation, and handling syntactic ambiguity. We use three datasets for this
+task – the Microsoft Paraphrase corpus (MRPC) [14] (collected from news sources), the Quora
+Question Pairs (QQP) dataset [9], and the Semantic Textual Similarity benchmark (STS-B) [6].
+We obtain state-of-the-art results on two of the three semantic similarity tasks (Table 4) with a 1
+point absolute gain on STS-B. The performance delta on QQP is significant, with a 4.2% absolute
+improvement over Single-task BiLSTM + ELMo + Attn.
+Classification Finally, we also evaluate on two different text classification tasks. The Corpus
+of Linguistic Acceptability (CoLA) [65] contains expert judgements on whether a sentence is
+grammatical or not, and tests the innate linguistic bias of trained models. The Stanford Sentiment
+Treebank (SST-2) [54], on the other hand, is a standard binary classification task. Our model obtains
+a score of 45.4 on CoLA, which is an especially big jump over the previous best result of 35.0,
+showcasing the innate linguistic bias learned by our model. The model also achieves 91.3% accuracy
+on SST-2, which is competitive with the state-of-the-art results. We also achieve an overall score of
+72.8 on the GLUE benchmark, which is significantly better than the previous best of 68.9.
+Table 4: Semantic similarity and classification results, comparing our model with current state-of-the-art methods. All task evaluations in this table were done using the GLUE benchmark. (mc = Matthews
+correlation, acc = Accuracy, pc = Pearson correlation)
+Method Classification Semantic Similarity GLUE
+CoLA SST2 MRPC STSB QQP
+(mc) (acc) (F1) (pc) (F1)
+Sparse byte mLSTM [16] - 93.2 - - - -
+TF-KLD [23] - - 86.0 - - -
+ECNU (mixed ensemble) [60] - - - 81.0 - -
+Single-task BiLSTM + ELMo + Attn [64] 35.0 90.2 80.2 55.5 66.1 64.8
+Multi-task BiLSTM + ELMo + Attn [64] 18.9 91.6 83.5 72.8 63.3 68.9
+Finetuned Transformer LM (ours) 45.4 91.3 82.3 82.0 70.3 72.8
+Overall, our approach achieves new state-of-the-art results in 9 out of the 12 datasets we evaluate
+on, outperforming ensembles in many cases. Our results also indicate that our approach works well
+across datasets of different sizes, from smaller datasets such as STS-B (≈5.7k training examples) –
+to the largest one – SNLI (≈550k training examples).
+5 Analysis
+Impact of number of layers transferred We observed the impact of transferring a variable number
+of layers from unsupervised pre-training to the supervised target task. Figure 2(left) illustrates the
+performance of our approach on MultiNLI and RACE as a function of the number of layers transferred.
+We observe the standard result that transferring embeddings improves performance and that each
+transformer layer provides further benefits up to 9% for full transfer on MultiNLI. This indicates that
+each layer in the pre-trained model contains useful functionality for solving target tasks.
+Figure 2: (left) Effect of transferring increasing number of layers from the pre-trained language
+model on RACE and MultiNLI. (right) Plot showing the evolution of zero-shot performance on
+different tasks as a function of LM pre-training updates. Performance per task is normalized between
+a random guess baseline and the current state-of-the-art with a single model.
+Zero-shot Behaviors We'd like to better understand why language model pre-training of transformers is effective. A hypothesis is that the underlying generative model learns to perform many of the
+tasks we evaluate on in order to improve its language modeling capability and that the more structured
+attentional memory of the transformer assists in transfer compared to LSTMs. We designed a series
+of heuristic solutions that use the underlying generative model to perform tasks without supervised
+finetuning. We visualize the effectiveness of these heuristic solutions over the course of generative
+pre-training in Fig 2(right). We observe the performance of these heuristics is stable and steadily
+increases over training suggesting that generative pretraining supports the learning of a wide variety
+of task relevant functionality. We also observe the LSTM exhibits higher variance in its zero-shot
+performance suggesting that the inductive bias of the Transformer architecture assists in transfer.
+For CoLA (linguistic acceptability), examples are scored as the average token log-probability the
+generative model assigns and predictions are made by thresholding. For SST-2 (sentiment analysis),
+we append the token very to each example and restrict the language model's output distribution to only
+the words positive and negative and guess the token it assigns higher probability to as the prediction.
+For RACE (question answering), we pick the answer the generative model assigns the highest average
+token log-probability when conditioned on the document and question. For DPRD [46] (Winograd
+schemas), we replace the definite pronoun with the two possible referents and predict the resolution
+that the generative model assigns higher average token log-probability to the rest of the sequence
+after the substitution.
+Table 5: Analysis of various model ablations on different tasks. Avg. score is an unweighted average
+of all the results. (mc = Matthews correlation, acc = Accuracy, pc = Pearson correlation)
+Method Avg. Score CoLA SST2 MRPC STSB QQP MNLI QNLI RTE
+(mc) (acc) (F1) (pc) (F1) (acc) (acc) (acc)
+Transformer w/ aux LM (full) 74.7 45.4 91.3 82.3 82.0 70.3 81.8 88.1 56.0
+Transformer w/o pre-training 59.9 18.9 84.0 79.4 30.9 65.5 75.7 71.2 53.8
+Transformer w/o aux LM 75.0 47.9 92.0 84.9 83.2 69.8 81.1 86.9 54.4
+LSTM w/ aux LM 69.1 30.3 90.5 83.2 71.8 68.1 73.7 81.1 54.6
+Ablation studies We perform three different ablation studies (Table 5). First, we examine the
+performance of our method without the auxiliary LM objective during fine-tuning. We observe that
+the auxiliary objective helps on the NLI tasks and QQP. Overall, the trend suggests that larger datasets
+benefit from the auxiliary objective but smaller datasets do not. Second, we analyze the effect of the
+Transformer by comparing it with a single layer 2048 unit LSTM using the same framework. We
+observe a 5.6 average score drop when using the LSTM instead of the Transformer. The LSTM only
+outperforms the Transformer on one dataset – MRPC. Finally, we also compare with our transformer
+architecture directly trained on supervised target tasks, without pre-training. We observe that the lack
+of pre-training hurts performance across all the tasks, resulting in a 14.8% decrease compared to our
+full model.
+6 Conclusion
+We introduced a framework for achieving strong natural language understanding with a single
+task-agnostic model through generative pre-training and discriminative fine-tuning. By pre-training
+on a diverse corpus with long stretches of contiguous text our model acquires significant world
+knowledge and ability to process long-range dependencies which are then successfully transferred to
+solving discriminative tasks such as question answering, semantic similarity assessment, entailment
+determination, and text classification, improving the state of the art on 9 of the 12 datasets we
+study. Using unsupervised (pre-)training to boost performance on discriminative tasks has long
+been an important goal of Machine Learning research. Our work suggests that achieving significant
+performance gains is indeed possible, and offers hints as to what models (Transformers) and data sets
+(text with long range dependencies) work best with this approach. We hope that this will help enable
+new research into unsupervised learning, for both natural language understanding and other domains,
+further improving our understanding of how and when unsupervised learning works.
+References
+[1] S. Arora, Y. Liang, and T. Ma. A simple but tough-to-beat baseline for sentence embeddings. 2016.
+[2] J. L. Ba, J. R. Kiros, and G. E. Hinton. Layer normalization. arXiv preprint arXiv:1607.06450, 2016.
+[3] Y. Bengio, P. Lamblin, D. Popovici, and H. Larochelle. Greedy layer-wise training of deep networks. In
+Advances in neural information processing systems, pages 153–160, 2007.
+[4] L. Bentivogli, P. Clark, I. Dagan, and D. Giampiccolo. The fifth pascal recognizing textual entailment
+challenge. In TAC, 2009.
+[5] S. R. Bowman, G. Angeli, C. Potts, and C. D. Manning. A large annotated corpus for learning natural
+language inference. EMNLP, 2015.
+[6] D. Cer, M. Diab, E. Agirre, I. Lopez-Gazpio, and L. Specia. Semeval-2017 task 1: Semantic textual
+similarity-multilingual and cross-lingual focused evaluation. arXiv preprint arXiv:1708.00055, 2017.
+[7] S. Chaturvedi, H. Peng, and D. Roth. Story comprehension for predicting what happens next. In Proceedings
+of the 2017 Conference on Empirical Methods in Natural Language Processing, pages 1603–1614, 2017.
+[8] D. Chen and C. Manning. A fast and accurate dependency parser using neural networks. In Proceedings
+of the 2014 conference on empirical methods in natural language processing (EMNLP), pages 740–750,
+2014.
+[9] Z. Chen, H. Zhang, X. Zhang, and L. Zhao. Quora question pairs. https://data.quora.com/First-QuoraDataset-Release-Question-Pairs, 2018.
+[10] R. Collobert and J. Weston. A unified architecture for natural language processing: Deep neural networks
+with multitask learning. In Proceedings of the 25th international conference on Machine learning, pages
+160–167. ACM, 2008.
+[11] R. Collobert, J. Weston, L. Bottou, M. Karlen, K. Kavukcuoglu, and P. Kuksa. Natural language processing
+(almost) from scratch. Journal of Machine Learning Research, 12(Aug):2493–2537, 2011.
+[12] A. Conneau, D. Kiela, H. Schwenk, L. Barrault, and A. Bordes. Supervised learning of universal sentence
+representations from natural language inference data. EMNLP, 2017.
+[13] A. M. Dai and Q. V. Le. Semi-supervised sequence learning. In Advances in Neural Information Processing
+Systems, pages 3079–3087, 2015.
+[14] W. B. Dolan and C. Brockett. Automatically constructing a corpus of sentential paraphrases. In Proceedings
+of the Third International Workshop on Paraphrasing (IWP2005), 2005.
+[15] D. Erhan, Y. Bengio, A. Courville, P.-A. Manzagol, P. Vincent, and S. Bengio. Why does unsupervised
+pre-training help deep learning? Journal of Machine Learning Research, 11(Feb):625–660, 2010.
+[16] S. Gray, A. Radford, and K. P. Diederik. GPU kernels for block-sparse weights. 2017.
+[17] Z. He, S. Liu, M. Li, M. Zhou, L. Zhang, and H. Wang. Learning entity representation for entity disambiguation. In Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics
+(Volume 2: Short Papers), volume 2, pages 30–34, 2013.
+[18] D. Hendrycks and K. Gimpel. Bridging nonlinearities and stochastic regularizers with gaussian error linear
+units. arXiv preprint arXiv:1606.08415, 2016.
+[19] K. M. Hermann, T. Kocisky, E. Grefenstette, L. Espeholt, W. Kay, M. Suleyman, and P. Blunsom. Teaching
+machines to read and comprehend. In Advances in Neural Information Processing Systems, pages 1693–1701, 2015.
+[20] G. E. Hinton, S. Osindero, and Y.-W. Teh. A fast learning algorithm for deep belief nets. Neural
+computation, 18(7):1527–1554, 2006.
+[21] J. Howard and S. Ruder. Universal language model fine-tuning for text classification. Association for
+Computational Linguistics (ACL), 2018.
+[22] Y. Jernite, S. R. Bowman, and D. Sontag. Discourse-based objectives for fast unsupervised sentence
+representation learning. arXiv preprint arXiv:1705.00557, 2017.
+[23] Y. Ji and J. Eisenstein. Discriminative improvements to distributional sentence similarity. In Proceedings
+of the 2013 Conference on Empirical Methods in Natural Language Processing, pages 891–896, 2013.
+[24] F. Jiao, S. Wang, C.-H. Lee, R. Greiner, and D. Schuurmans. Semi-supervised conditional random fields
+for improved sequence segmentation and labeling. In Proceedings of the 21st International Conference on
+Computational Linguistics and the 44th annual meeting of the Association for Computational Linguistics,
+pages 209–216. Association for Computational Linguistics, 2006.
+[25] T. Khot, A. Sabharwal, and P. Clark. Scitail: A textual entailment dataset from science question answering.
+In Proceedings of AAAI, 2018.
+[26] Y. Kim. Convolutional neural networks for sentence classification. EMNLP, 2014.
+[27] D. P. Kingma and J. Ba. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980,
+2014.
+[28] R. Kiros, Y. Zhu, R. R. Salakhutdinov, R. Zemel, R. Urtasun, A. Torralba, and S. Fidler. Skip-thought
+vectors. In Advances in neural information processing systems, pages 3294–3302, 2015.
+[29] N. Kitaev and D. Klein. Constituency parsing with a self-attentive encoder. ACL, 2018.
+[30] G. Lai, Q. Xie, H. Liu, Y. Yang, and E. Hovy. Race: Large-scale reading comprehension dataset from
+examinations. EMNLP, 2017.
+[31] G. Lample, L. Denoyer, and M. Ranzato. Unsupervised machine translation using monolingual corpora
+only. ICLR, 2018.
+[32] Q. Le and T. Mikolov. Distributed representations of sentences and documents. In International Conference
+on Machine Learning, pages 1188–1196, 2014.
+[33] P. Liang. Semi-supervised learning for natural language. PhD thesis, Massachusetts Institute of Technology,
+2005.
+[34] P. J. Liu, M. Saleh, E. Pot, B. Goodrich, R. Sepassi, L. Kaiser, and N. Shazeer. Generating wikipedia by
+summarizing long sequences. ICLR, 2018.
+[35] X. Liu, K. Duh, and J. Gao. Stochastic answer networks for natural language inference. arXiv preprint
+arXiv:1804.07888, 2018.
+[36] L. Logeswaran and H. Lee. An efficient framework for learning sentence representations. ICLR, 2018.
+[37] I. Loshchilov and F. Hutter. Fixing weight decay regularization in adam. arXiv preprint arXiv:1711.05101,
+2017.
+[38] B. McCann, J. Bradbury, C. Xiong, and R. Socher. Learned in translation: Contextualized word vectors. In
+Advances in Neural Information Processing Systems, pages 6297–6308, 2017.
+[39] T. Mikolov, I. Sutskever, K. Chen, G. S. Corrado, and J. Dean. Distributed representations of words
+and phrases and their compositionality. In Advances in neural information processing systems, pages
+3111–3119, 2013.
+[40] N. Mostafazadeh, M. Roth, A. Louis, N. Chambers, and J. Allen. Lsdsem 2017 shared task: The story cloze
+test. In Proceedings of the 2nd Workshop on Linking Models of Lexical, Sentential and Discourse-level
+Semantics, pages 46–51, 2017.
+[41] K. Nigam, A. McCallum, and T. Mitchell. Semi-supervised text classification using EM. Semi-Supervised
+Learning, pages 33–56, 2006.
+[42] J. Pennington, R. Socher, and C. Manning. Glove: Global vectors for word representation. In Proceedings
+of the 2014 conference on empirical methods in natural language processing (EMNLP), pages 1532–1543,
+2014.
+[43] M. E. Peters, W. Ammar, C. Bhagavatula, and R. Power. Semi-supervised sequence tagging with bidirectional language models. ACL, 2017.
+[44] M. E. Peters, M. Neumann, M. Iyyer, M. Gardner, C. Clark, K. Lee, and L. Zettlemoyer. Deep contextualized word representations. NAACL, 2018.
+[45] Y. Qi, D. S. Sachan, M. Felix, S. J. Padmanabhan, and G. Neubig. When and why are pre-trained word
+embeddings useful for neural machine translation? NAACL, 2018.
+[46] A. Rahman and V. Ng. Resolving complex cases of definite pronouns: the winograd schema challenge. In
+Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and
+Computational Natural Language Learning, pages 777–789. Association for Computational Linguistics,
+2012.
+[47] P. Rajpurkar, J. Zhang, K. Lopyrev, and P. Liang. Squad: 100,000+ questions for machine comprehension
+of text. EMNLP, 2016.
+[48] P. Ramachandran, P. J. Liu, and Q. V. Le. Unsupervised pretraining for sequence to sequence learning.
+arXiv preprint arXiv:1611.02683, 2016.
+[49] M. Ranzato, C. Poultney, S. Chopra, and Y. LeCun. Efficient learning of sparse representations with an
+energy-based model. In Advances in neural information processing systems, pages 1137–1144, 2007.
+[50] M. Rei. Semi-supervised multitask learning for sequence labeling. ACL, 2017.
+[51] H. Robbins and S. Monro. A stochastic approximation method. The annals of mathematical statistics,
+pages 400–407, 1951.
+[52] T. Rocktäschel, E. Grefenstette, K. M. Hermann, T. Kočiský, and P. Blunsom. Reasoning about entailment
+with neural attention. arXiv preprint arXiv:1509.06664, 2015.
+[53] R. Sennrich, B. Haddow, and A. Birch. Neural machine translation of rare words with subword units. arXiv
+preprint arXiv:1508.07909, 2015.
+[54] R. Socher, A. Perelygin, J. Wu, J. Chuang, C. D. Manning, A. Ng, and C. Potts. Recursive deep models for
+semantic compositionality over a sentiment treebank. In Proceedings of the 2013 conference on empirical
+methods in natural language processing, pages 1631–1642, 2013.
+[55] S. Srinivasan, R. Arora, and M. Riedl. A simple and effective approach to the story cloze test. arXiv
+preprint arXiv:1803.05547, 2018.
+[56] S. Subramanian, A. Trischler, Y. Bengio, and C. J. Pal. Learning general purpose distributed sentence
+representations via large scale multi-task learning. arXiv preprint arXiv:1804.00079, 2018.
+[57] J. Suzuki and H. Isozaki. Semi-supervised sequential labeling and segmentation using giga-word scale
+unlabeled data. Proceedings of ACL-08: HLT, pages 665–673, 2008.
+[58] Y. Tay, L. A. Tuan, and S. C. Hui. A compare-propagate architecture with alignment factorization for
+natural language inference. arXiv preprint arXiv:1801.00102, 2017.
+[59] Y. Tay, L. A. Tuan, and S. C. Hui. Multi-range reasoning for machine comprehension. arXiv preprint
+arXiv:1803.09074, 2018.
+[60] J. Tian, Z. Zhou, M. Lan, and Y. Wu. Ecnu at semeval-2017 task 1: Leverage kernel-based traditional nlp
+features and neural networks to build a universal model for multilingual and cross-lingual semantic textual
+similarity. In Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017),
+pages 191–197, 2017.
+[61] Y. Tsvetkov. Opportunities and challenges in working with low-resource languages. CMU, 2017.
+[62] A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, Ł. Kaiser, and I. Polosukhin.
+Attention is all you need. In Advances in Neural Information Processing Systems, pages 6000–6010, 2017.
+[63] P. Vincent, H. Larochelle, Y. Bengio, and P.-A. Manzagol. Extracting and composing robust features with
+denoising autoencoders. In Proceedings of the 25th international conference on Machine learning, pages
+1096–1103. ACM, 2008.
+[64] A. Wang, A. Singh, J. Michael, F. Hill, O. Levy, and S. R. Bowman. Glue: A multi-task benchmark and
+analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461, 2018.
+[65] A. Warstadt, A. Singh, and S. R. Bowman. Corpus of linguistic acceptability. http://nyu-mll.github.io/cola,
+2018.
+[66] A. Williams, N. Nangia, and S. R. Bowman. A broad-coverage challenge corpus for sentence understanding
+through inference. NAACL, 2018.
+[67] Y. Xu, J. Liu, J. Gao, Y. Shen, and X. Liu. Towards human-level machine reading comprehension:
+Reasoning and inference with multiple strategies. arXiv preprint arXiv:1711.04964, 2017.
+[68] D. Yu, L. Deng, and G. Dahl. Roles of pre-training and fine-tuning in context-dependent dbn-hmms for
+real-world speech recognition. In Proc. NIPS Workshop on Deep Learning and Unsupervised Feature
+Learning, 2010.
+[69] R. Zhang, P. Isola, and A. A. Efros. Split-brain autoencoders: Unsupervised learning by cross-channel
+prediction. In CVPR, volume 1, page 6, 2017.
+[70] X. Zhu. Semi-supervised learning literature survey. 2005.
+[71] Y. Zhu, R. Kiros, R. Zemel, R. Salakhutdinov, R. Urtasun, A. Torralba, and S. Fidler. Aligning books and
+movies: Towards story-like visual explanations by watching movies and reading books. In Proceedings of
+the IEEE international conference on computer vision, pages 19–27, 2015.
+"""
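gpt_text.py only supplies the GPT-1 paper reproduced above as the default sample text for the app. Purely as a reading aid for Section 3.3 of that text, and not code from this repository, here is a minimal sketch of the traversal-style input transformation it describes for question answering: each candidate answer is concatenated with the context and question around a delimiter, scored independently, and the scores are normalized with a softmax. The score_sequence callable is a hypothetical placeholder standing in for the pre-trained model plus linear head.

# Sketch of the paper's QA input transformation: build <s> z q $ a_k <e> per candidate,
# score each sequence independently, then softmax-normalize over the candidates.
import math
from typing import Callable, List

START, DELIM, END = "<s>", "$", "<e>"

def answer_distribution(z: str, q: str, answers: List[str],
                        score_sequence: Callable[[str], float]) -> List[float]:
    sequences = [f"{START} {z} {q} {DELIM} {a} {END}" for a in answers]
    scores = [score_sequence(s) for s in sequences]   # one scalar score per candidate
    m = max(scores)
    exps = [math.exp(s - m) for s in scores]          # numerically stable softmax
    total = sum(exps)
    return [e / total for e in exps]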