"""

HotFlip
===========
(HotFlip: White-Box Adversarial Examples for Text Classification)

"""
from textattack import Attack
from textattack.constraints.grammaticality import PartOfSpeech
from textattack.constraints.overlap import MaxWordsPerturbed
from textattack.constraints.pre_transformation import (
    RepeatModification,
    StopwordModification,
)
from textattack.constraints.semantics import WordEmbeddingDistance
from textattack.goal_functions import UntargetedClassification
from textattack.search_methods import BeamSearch
from textattack.transformations import WordSwapGradientBased

from .attack_recipe import AttackRecipe


class HotFlipEbrahimi2017(AttackRecipe):
    """Ebrahimi, J. et al. (2017)

    HotFlip: White-Box Adversarial Examples for Text Classification

    https://arxiv.org/abs/1712.06751

    This is a reproduction of the HotFlip word-level attack (section 5 of the
    paper).
    """

    @staticmethod
    def build(model_wrapper):
        #
        # "HotFlip ... uses the gradient with respect to a one-hot input
        # representation to efficiently estimate which individual change has the
        # highest estimated loss."
        transformation = WordSwapGradientBased(model_wrapper, top_n=1)
        #
        # Don't modify the same word twice, and don't modify stopwords
        #
        constraints = [RepeatModification(), StopwordModification()]
        #
        # 0. "We were able to create only 41 examples (2% of the correctly-
        # classified instances of the SST test set) with one or two flips."
        #
        constraints.append(MaxWordsPerturbed(max_num_words=2))
        #
        # 1. "The cosine similarity between the embedding of words is bigger than a
        #   threshold (0.8)."
        #
        constraints.append(WordEmbeddingDistance(min_cos_sim=0.8))
        #
        # 2. "The two words have the same part-of-speech."
        #
        constraints.append(PartOfSpeech())
        #
        # Goal is untargeted classification
        #
        goal_function = UntargetedClassification(model_wrapper)
        #
        # "HotFlip ... uses a beam search to find a set of manipulations that work
        # well together to confuse a classifier ... The adversary uses a beam size
        # of 10."
        #
        search_method = BeamSearch(beam_width=10)

        return Attack(goal_function, constraints, transformation, search_method)
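

# Example usage (sketch). The gradient-based transformation above needs
# white-box access to word-level gradients, so this recipe is typically run
# against one of TextAttack's built-in word-level models, e.g. via the CLI:
#
#     textattack attack --recipe hotflip --model lstm-mr --num-examples 10
#
# or from Python with a recent TextAttack version, assuming `model_wrapper`
# already wraps such a gradient-capable model:
#
#     from textattack import AttackArgs, Attacker
#     from textattack.datasets import HuggingFaceDataset
#
#     attack = HotFlipEbrahimi2017.build(model_wrapper)
#     dataset = HuggingFaceDataset("rotten_tomatoes", split="test")
#     Attacker(attack, dataset, AttackArgs(num_examples=10)).attack_dataset()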