# Rapid-Textual-Adversarial-Defense / textattack / attack_recipes / faster_genetic_algorithm_jia_2019.py
# Source: anonymous8/RPD-Demo — initial commit (4943752)
"""
Faster Alzantot Genetic Algorithm
===================================
(Certified Robustness to Adversarial Word Substitutions)
"""
from textattack import Attack
from textattack.constraints.grammaticality.language_models import (
LearningToWriteLanguageModel,
)
from textattack.constraints.overlap import MaxWordsPerturbed
from textattack.constraints.pre_transformation import (
RepeatModification,
StopwordModification,
)
from textattack.constraints.semantics import WordEmbeddingDistance
from textattack.goal_functions import UntargetedClassification
from textattack.search_methods import AlzantotGeneticAlgorithm
from textattack.transformations import WordSwapEmbedding
from .attack_recipe import AttackRecipe
class FasterGeneticAlgorithmJia2019(AttackRecipe):
    """Certified Robustness to Adversarial Word Substitutions.

    Robin Jia, Aditi Raghunathan, Kerem Göksel, Percy Liang (2019).
    https://arxiv.org/pdf/1909.00986.pdf
    """

    @staticmethod
    def build(model_wrapper):
        """Build the faster genetic-algorithm attack of Jia et al. (2019).

        A sped-up variant of the Alzantot et al. (2018) genetic attack.
        Following Section 5 / Appendix A.3 of the paper, allowed
        substitutions are precomputed relative to the ORIGINAL input
        (rather than the current perturbed text), which avoids re-scoring
        with the language model on every generation and prevents semantic
        drift from repeated substitution of the same word.

        Args:
            model_wrapper: the victim model wrapped for TextAttack.

        Returns:
            An ``Attack`` configured with the paper's hyperparameters.
        """
        # Neighborhood N(w): the n = 8 nearest neighbors of each word in the
        # counter-fitted Paragram embedding space (antonyms pushed far apart).
        swap = WordSwapEmbedding(max_candidates=8)

        # Constraints, per the paper:
        #   * never perturb the same word twice (no semantic drift) and
        #     leave stopwords alone;
        #   * perturb at most 20% of the words;
        #   * each replacement stays within Euclidean embedding distance
        #     delta = 0.5 of the original word;
        #   * the "Learning To Write" language model must not see the
        #     log-likelihood drop by more than delta = 5 within a window of
        #     radius W = 6, scored against the original sentence x so the
        #     attack surface is fixed up front.
        attack_constraints = [
            RepeatModification(),
            StopwordModification(),
            MaxWordsPerturbed(max_percent=0.2),
            WordEmbeddingDistance(max_mse_dist=0.5),
            LearningToWriteLanguageModel(
                window_size=6,
                max_log_prob_diff=5.0,
                compare_against_original=True,
            ),
        ]

        # Untargeted: success is flipping the predicted label to anything else.
        objective = UntargetedClassification(model_wrapper)

        # Genetic search with population S = 60 over 40 generations; the
        # post-crossover language-model check is disabled for speed.
        search = AlzantotGeneticAlgorithm(
            pop_size=60, max_iters=40, post_crossover_check=False
        )

        return Attack(objective, attack_constraints, swap, search)