In [1]:
#Import the libraries we know we'll need for the Generator.
import pandas as pd, spacy, nltk, numpy as np, re, ssl
from spacy import displacy
from spacy.matcher import Matcher
from nltk.corpus import wordnet
# One-time setup: uncomment the next line to download the spaCy model loaded below.
#!python -m spacy download en_core_web_md
# Shared spaCy pipeline; later cells use it for POS tagging, similarity scores and displacy rendering.
nlp = spacy.load("en_core_web_md")
# Handle to the pipeline's lemmatizer component (not referenced by any cell visible in this notebook section).
lemmatizer = nlp.get_pipe("lemmatizer")

#Import the libraries to support the model, predictions, and LIME.
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import lime
import torch
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer

#Import the libraries for generating interactive visualizations.
import altair as alt

In [2]:
# Shared model objects used by the prediction and explanation cells below.
# The checkpoint name lives in one constant so the tokenizer and the model can never drift apart.
MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# return_all_scores=True is kept on purpose: eval_pred_test indexes the result as
# preds[0][0] / preds[0][1], i.e. it relies on getting one {'label', 'score'} entry per class.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
# NOTE(review): the order is assumed to match the model's label indices (0 = negative, 1 = positive)
# — confirm against model.config.id2label.
class_names = ['negative', 'positive']
explainer = LimeTextExplainer(class_names=class_names)

In [3]:
#Defining a Predictor required for LIME to function.
def predictor(texts, batch_size=64):
    """Return class probabilities for a batch of texts (the classifier function LIME calls).

    Parameters
    ----------
    texts : str or iterable of str
        A single document or a collection of documents (list, tuple, NumPy array, ...).
    batch_size : int, default 64
        Number of texts sent through the model per forward pass. LIME submits all of its
        perturbed samples in one call, so chunking keeps peak memory bounded.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(texts), n_classes); each row is the softmax over the model logits.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # Normalise the input to a plain list of str: the tokenizer rejects NumPy arrays,
    # and a bare string is treated as one document.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = [str(t) for t in texts]

    if not texts:
        # Nothing to score: return an empty array that still has the class axis.
        return np.empty((0, model.config.num_labels), dtype=np.float32)

    chunks = []
    # Inference only: no_grad avoids building an autograd graph for every sample.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            # truncation=True keeps over-long inputs within the model's maximum length
            # instead of letting the forward pass fail.
            encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            logits = model(**encoded).logits
            chunks.append(F.softmax(logits, dim=1).numpy())
    return np.concatenate(chunks, axis=0)

In [4]:
# A simple function to pull synonyms and antonyms using spacy's POS
def syn_ant(word,POS=False,human=True):
    """Collect WordNet synonyms and antonyms for ``word``.

    Parameters
    ----------
    word : str
        Word or phrase to look up; spaces are converted to underscores for WordNet.
    POS : str or False, default False
        spaCy-style coarse POS tag. When it is one of 'NOUN', 'VERB', 'ADJ', 'ADV' the lookup
        is restricted to that part of speech; any other value searches all parts of speech.
    human : bool, default True
        When True, underscores in the returned lemma names are replaced by spaces.

    Returns
    -------
    (list of str, list of str)
        De-duplicated synonyms and antonyms, each sorted so the result is reproducible
        from one kernel session to the next.
    """
    pos_options = ['NOUN','VERB','ADJ','ADV']
    # wordnet exposes its POS constants under the same names (wordnet.NOUN, ...);
    # pos=None means "all parts of speech".
    wn_pos = getattr(wordnet, POS) if POS in pos_options else None
    #WordNet hates spaces so you have to remove them
    word = word.replace(" ", "_")

    def _render(name):
        # Lemma names use underscores for multi-word entries.
        return name.replace("_", " ") if human else name

    synonyms = set()
    antonyms = set()
    for syn in wordnet.synsets(word, pos=wn_pos):
        for lemma in syn.lemmas():
            synonyms.add(_render(lemma.name()))
            antonyms.update(_render(ant.name()) for ant in lemma.antonyms())
    return sorted(synonyms), sorted(antonyms)

In [5]:
# Builds a list dynamically from WordNet using NLTK.
def wordnet_list(word,POS=False):
    word = word.lower()
    pos_options = ['NOUN','VERB','ADJ','ADV']
    synonyms, antonyms = syn_ant(word,POS,False)
    #print(synonyms, antonyms)
    base = []
    final = [word]
    #WordNet hates spaces so you have to remove them
    m_word = word.replace(" ", "_")
    
    if POS in pos_options:
        for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS)):
            base.extend(syn.hyponyms())
            base.append(syn)
        
        if len(synonyms) > 0:
            for w in synonyms:
                w = w.replace(" ","_")
                for syn in wordnet.synsets(w, pos=getattr(wordnet, POS)):
                    base.extend(syn.hyponyms())
                    base.append(syn)
        if len(antonyms) > 0:
            for a in antonyms:
                a = a.replace(" ","_")
                for syn in wordnet.synsets(a, pos=getattr(wordnet, POS)):
                    base.extend(syn.hyponyms())
                    base.append(syn)
    else:
        for syn in wordnet.synsets(m_word):
            base.extend(syn.hyponyms())
            base.append(syn)
        
        if len(synonyms) > 0:
            for w in synonyms:
                w = w.replace(" ","_")
                for syn in wordnet.synsets(w):
                    base.extend(syn.hyponyms())
                    base.append(syn)
        if len(antonyms) > 0:
            for a in antonyms:
                a = a.replace(" ","_")
                for syn in wordnet.synsets(a):
                    base.extend(syn.hyponyms())
                    base.append(syn)
    base = list(set(base))
    for b in base:
        cur_words = []
        cur_words.extend([re.sub("_"," ",lemma.name()) for lemma in b.lemmas()])
        final.extend(cur_words)

        
                
    final = list(set(final))    
    return final

In [6]:
def eval_pred_test(text, return_all = False):
    '''Score ``text`` with the sentiment pipeline and fold the result into one signed number.

    The first entry of the pipeline output is treated as the negative class and the second as
    the positive class. The winning class decides the result: its score is returned as-is for
    the positive class and negated for the negative class. If neither score is strictly
    larger, the prediction is 0 and the sentiment is an empty string.

    Returns the signed score, or ``(signed score, label)`` when ``return_all`` is true.
    '''
    scores = pipe(text)[0]
    neg_entry, pos_entry = scores[0], scores[1]
    neg_score = -1 * neg_entry['score']
    pos_score = pos_entry['score']

    if pos_score > abs(neg_score):
        outcome = (pos_score, pos_entry['label'])
    elif abs(neg_score) > pos_score:
        outcome = (neg_score, neg_entry['label'])
    else:
        # Neither side wins: same neutral defaults as before.
        outcome = (0, '')

    return outcome if return_all else outcome[0]

In [7]:
def cf_from_wordnet_list(seed,text):
    """Generate counterfactual sentences by swapping ``seed`` for WordNet alternatives.

    Parameters
    ----------
    seed : str
        The word in ``text`` to replace. Its POS tag (from spaCy, first token) restricts the
        WordNet lookup done by ``wordnet_list``.
    text : str
        Sentence containing ``seed``; every whole-word occurrence is replaced.

    Returns
    -------
    pandas.DataFrame
        Columns: ``Words`` (alternative), ``Sentences`` (text with the swap applied),
        ``Similarity`` (spaCy similarity to the seed), ``Prediction`` (``eval_pred_test``
        score of the sentence) and ``Seed`` ('Seed' for the seed itself, else 'Alternative').
        Rows whose similarity is not greater than 0 are dropped.
    """
    seed_token = nlp(seed)
    seed_POS = seed_token[0].pos_
    words = wordnet_list(seed,seed_POS)

    # Escape the seed so characters such as '.' or '+' are matched literally, not as regex syntax.
    seed_pattern = re.compile(r'\b' + re.escape(seed) + r'\b')

    df = pd.DataFrame({"Words": words})
    # A callable replacement inserts the word verbatim; a plain string would be parsed as a
    # replacement template (backslashes / group references).
    df["Sentences"] = df.Words.apply(lambda x: seed_pattern.sub(lambda _m: x, text))
    # Only the first token on each side is compared, so a multi-word alternative is scored by
    # its first word. NOTE(review): cf_from_wordnet_df compares whole docs instead — confirm
    # which behaviour is intended.
    df["Similarity"] = df.Words.apply(lambda x: seed_token[0].similarity(nlp(x)[0]))
    df = df[df.Similarity > 0].reset_index(drop=True)
    df["Prediction"] = df.Sentences.apply(eval_pred_test)
    #added this because I think it will make the end results better if we ensure the seed is in the data we generate counterfactuals from.
    # wordnet_list lower-cases the seed, so compare case-insensitively or a capitalised seed is never flagged.
    seed_lower = seed.lower()
    df['Seed'] = df.Words.apply(lambda x: 'Seed' if x.lower() == seed_lower else 'Alternative')
    return df
    

In [8]:
# Demo inputs shared by the cells below: the word to replace and the sentence containing it.
seed = "film"
# The sentence also contains "filmed"; the generator functions replace only whole-word
# matches of the seed (they wrap it in \b word boundaries).
text = f"This {seed} was filmed in Iraq."
# Display the assembled sentence.
text

'This film was filmed in Iraq.'

In [9]:
# Scratch example: parse one sentence with the shared spaCy pipeline and
# render it with displacy's "dep" (dependency) style.
test = "I met a naked doctor."
testdoc = nlp(test)
displacy.render(testdoc, style="dep")

In [10]:
spacy.explain("amod")

'adjectival modifier'

In [11]:
# Build the counterfactual table for the demo seed/sentence using the list-based generator.
# Every generated sentence is scored by the sentiment pipeline (via eval_pred_test).
cf_df = cf_from_wordnet_list(seed,text)

  df["Similarity"] = df.Words.apply(lambda x: seed_token[0].similarity(nlp(x)[0]))


In [12]:
cf_df.head()

Unnamed: 0,Words,Sentences,Similarity,Prediction,Seed
0,diorama,This diorama was filmed in Iraq.,0.127023,0.793419,Alternative
1,longshot,This longshot was filmed in Iraq.,0.050408,-0.991559,Alternative
2,musical comedy,This musical comedy was filmed in Iraq.,1.0,0.910803,Alternative
3,characterisation,This characterisation was filmed in Iraq.,0.216481,-0.987333,Alternative
4,Polaroid,This Polaroid was filmed in Iraq.,0.171456,-0.979913,Alternative


In [13]:
def max_min(df, exclude="girl"):
    """Return the least and most similar words in a counterfactual frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Words`` and ``Similarity`` columns.
    exclude : str, default "girl"
        Word left out when searching for the most similar entry. The default keeps the
        previously hard-coded value; pass the seed of ``df`` to skip the seed itself.

    Returns
    -------
    (str, str)
        ``(least_similar_word, most_similar_word)``. The minimum is taken over the whole
        frame; the maximum over the rows whose word differs from ``exclude``.
    """
    candidates = df[df.Words != exclude]
    maximum = candidates.Similarity.max()
    # Look the maximum up in the filtered rows: searching the full frame could hand back the
    # excluded word whenever it ties with the best remaining candidate.
    text3 = candidates.loc[candidates['Similarity'] == maximum, 'Words'].iloc[0]
    minimum = df.Similarity.min()
    text2 = df.loc[df['Similarity'] == minimum, 'Words'].iloc[0]
    return text2, text3

In [14]:
# Scatter plot of every row in cf_df: similarity to the seed (x) against the signed
# sentiment prediction (y); colour and point size separate the seed from the alternatives.
# NOTE(review): selection_single / add_selection may be deprecated in newer Altair releases —
# confirm against the installed version.
single_nearest = alt.selection_single(on='mouseover', nearest=True)
full = alt.Chart(cf_df).encode(
    alt.X('Similarity:Q'),  # quantitative: similarity of the alternative to the seed
    alt.Y('Prediction:Q'),  # quantitative: signed prediction score
    color=alt.Color('Seed:N', legend=alt.Legend(title="Seed or Alternative")),
    size='Seed:N',
    tooltip=('Words','Prediction','Similarity')
).mark_circle(opacity=.5).properties(width=300).add_selection(single_nearest)

full

In [15]:
# Summary view: the five highest and five lowest predictions plus the seed row(s).
df2 = cf_df.nlargest(5, 'Prediction')
df3 = cf_df.nsmallest(5, 'Prediction')
df4 = cf_df[cf_df.Seed == "Seed"]
frames = [df2,df3,df4]
# The seed row appears twice in `results` if it is already among the top or bottom five.
results = pd.concat(frames)

# Despite the name `bar`, this is drawn with circle marks; words are ordered by descending x.
# Reuses the `single_nearest` selection object defined in the scatter-plot cell.
bar = alt.Chart(results).encode(  
    alt.X('Prediction:Q'), 
    alt.Y('Words:N', sort="-x"),
    color=alt.Color('Seed:N', legend=alt.Legend(title="Seed or Alternative")),
    size='Seed:N',
    tooltip=('Words','Prediction','Similarity')
).mark_circle().properties(width=300).add_selection(single_nearest)

bar

In [16]:
# Builds a DataFrame of (category, word) rows dynamically from WordNet using NLTK.
def wordnet_df(word,POS=False):
    """Build a DataFrame of candidate replacement words, labelled by source synset.

    The word itself, its synonyms and its antonyms (from ``syn_ant``) are each looked up in
    WordNet. For every synset found, one row is emitted per lemma of the synset and per lemma
    of each synset returned by its ``hyponyms()``.

    Parameters
    ----------
    word : str
        Seed word or phrase (spaces are converted to underscores for the lookup).
    POS : str or False, default False
        spaCy-style coarse POS tag. 'NOUN', 'VERB', 'ADJ' and 'ADV' restrict the lookups to
        that part of speech; any other value searches all parts of speech.

    Returns
    -------
    pandas.DataFrame
        Columns ``Categories`` (head word of the originating synset's name) and ``Words``
        (lemma name), underscores replaced by spaces, duplicates removed, index reset.
    """
    pos_options = ['NOUN','VERB','ADJ','ADV']
    synonyms, antonyms = syn_ant(word,POS,False)
    # pos=None means "all parts of speech", which is what the unrestricted lookup does.
    wn_pos = getattr(wordnet, POS) if POS in pos_options else None

    rows = []
    for term in [word, *synonyms, *antonyms]:
        #WordNet hates spaces so you have to remove them
        for syn in wordnet.synsets(term.replace(" ", "_"), pos=wn_pos):
            category = syn.name().split(".")[0].replace("_", " ")
            # Copy before extending: the list handed back by syn.lemmas() may be the synset's
            # own cached list, and extending it in place would leak hyponym lemmas into
            # every later lookup of this synset.
            cur_lemmas = list(syn.lemmas())
            for hypo in syn.hyponyms():
                cur_lemmas.extend(hypo.lemmas())
            for lemma in cur_lemmas:
                rows.append((category, lemma.name().replace("_", " ")))

    df = pd.DataFrame(rows, columns=["Categories", "Words"])
    return df.drop_duplicates().reset_index(drop=True)

In [17]:
def cf_from_wordnet_df(seed,text):
    """Generate counterfactual sentences from the category-labelled WordNet table.

    Parameters
    ----------
    seed : str
        The word in ``text`` to replace. Its POS tag (from spaCy, first token) restricts the
        WordNet lookup done by ``wordnet_df``; the tag is also printed.
    text : str
        Sentence containing ``seed``; every whole-word occurrence is replaced.

    Returns
    -------
    pandas.DataFrame
        The ``wordnet_df`` columns (``Categories``, ``Words``) plus ``Sentences`` (text with
        the swap applied), ``Word Similarity`` (spaCy similarity between the seed and the
        alternative, both as whole docs), ``Prediction`` (``eval_pred_test`` score) and
        ``Seed`` ('Seed' for the seed itself, else 'Alternative'). Rows whose similarity is
        not greater than 0 are dropped.
    """
    seed_token = nlp(seed)
    seed_POS = seed_token[0].pos_
    print(seed_POS)
    df = wordnet_df(seed,seed_POS)

    # Escape the seed so characters such as '.' or '+' are matched literally, not as regex syntax.
    seed_pattern = re.compile(r'\b' + re.escape(seed) + r'\b')

    # A callable replacement inserts the word verbatim; a plain string would be parsed as a
    # replacement template (backslashes / group references).
    df["Sentences"] = df.Words.apply(lambda x: seed_pattern.sub(lambda _m: x, text))
    df["Word Similarity"] = df.Words.apply(lambda x: seed_token.similarity(nlp(x)))
    df = df[df["Word Similarity"] > 0].reset_index(drop=True)
    df["Prediction"] = df.Sentences.apply(eval_pred_test)
    #added this because I think it will make the end results better if we ensure the seed is in the data we generate counterfactuals from.
    # Compare case-insensitively on both sides so a capitalised seed is still flagged.
    seed_lower = seed.lower()
    df['Seed'] = df.Words.apply(lambda x: 'Seed' if x.lower() == seed_lower else 'Alternative')
    return df

In [18]:
# Build the category-labelled counterfactual table for the same demo seed/sentence.
# The function prints the seed's POS tag as a side effect.
panic = cf_from_wordnet_df(seed,text)

NOUN


  df["Word Similarity"] = df.Words.apply(lambda x: seed_token.similarity(nlp(x)))


In [24]:
panic.head()

Unnamed: 0,Categories,Words,Sentences,Word Similarity,Prediction,Seed
0,movie,movie,This movie was filmed in Iraq.,0.519086,-0.985851,Alternative
1,movie,film,This film was filmed in Iraq.,1.0,-0.976839,Seed
2,movie,picture,This picture was filmed in Iraq.,0.275934,-0.966598,Alternative
3,movie,moving picture,This moving picture was filmed in Iraq.,0.317025,0.951934,Alternative
4,movie,moving-picture show,This moving-picture show was filmed in Iraq.,0.438731,-0.891211,Alternative


In [19]:
# Same scatter plot as for cf_df, now over `panic` (doc-level "Word Similarity" on x).
# This cell rebinds the names `single_nearest` and `full` used by the earlier chart cell.
single_nearest = alt.selection_single(on='mouseover', nearest=True)
full = alt.Chart(panic).encode(
    alt.X('Word Similarity:Q'),  # quantitative: similarity of the alternative to the seed
    alt.Y('Prediction:Q'),  # quantitative: signed prediction score
    color=alt.Color('Seed:N', legend=alt.Legend(title="Seed or Alternative")),
    size='Seed:N',
    tooltip=('Words','Prediction','Word Similarity')
).mark_circle(opacity=.5).properties(width=300).add_selection(single_nearest)

full

In [20]:
isinstance(cf_df, pd.DataFrame)

True

In [21]:
#https://github.com/tvst/st-annotated-text/blob/master/example.py

In [35]:
def get_sampled(df, seed, fixed=False):
    """Pick two counterfactual sentences at random, never the seed's own row.

    Rows whose ``Words`` value equals ``seed`` are removed before sampling. With
    ``fixed=True`` the draw uses a constant random state (2052) and is repeatable;
    otherwise each call may return a different pair.

    Returns the two sampled ``Sentences`` values as a tuple.
    """
    candidates = df[df['Words'] != seed]
    # Only the seeded variant passes a random_state; the default draw stays unseeded.
    sample_kwargs = {"random_state": 2052} if fixed else {}
    picked = candidates.sample(n=2, **sample_kwargs)
    first, second = picked.Sentences.iloc[0], picked.Sentences.iloc[1]
    return first, second

In [26]:
get_sampled(panic,"film")

('This cheesecake was filmed in Iraq.', 'This scum was filmed in Iraq.')

In [27]:
# Draw two alternative sentences for the demo seed. `fixed` is left at its default (False),
# so the pair changes from run to run; pass fixed=True for a repeatable draw.
text2, text3 = get_sampled(panic, "film")

In [28]:
text2

'This montage was filmed in Iraq.'

In [107]:
#inspired by https://stackoverflow.com/questions/17758023/return-rows-in-a-dataframe-closest-to-a-user-defined-number/17758115#17758115
def abs_dif(df,seed):
    """Find the alternatives whose prediction is closest to / farthest from the seed's.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Words``, ``Prediction`` and ``Sentences`` columns.
    seed : str
        Word whose row provides the reference prediction (first matching row is used).

    Returns
    -------
    (float, str, str)
        ``(target, nearest, farthest)``: the seed's prediction, then the sentence of the
        non-seed row with the smallest and with the largest absolute prediction difference.
        On ties the earliest row wins for ``nearest`` and the latest for ``farthest``.

    Raises
    ------
    IndexError
        If ``seed`` is not present in ``df['Words']`` or there is no other row to compare with.
    """
    seed_rows = df[df['Words'] == seed]
    if seed_rows.empty:
        raise IndexError(f"seed {seed!r} not found in df['Words']")
    target = seed_rows.Prediction.iloc[0]

    sub_df = df[df['Words'] != seed]
    if sub_df.empty:
        raise IndexError("df contains no rows other than the seed")

    # Sort once, purely by position: no label lookups, so the frame's index is irrelevant.
    distances = (sub_df.Prediction - target).abs().to_numpy()
    order = distances.argsort(kind="stable")
    nearest = sub_df.Sentences.iloc[order[0]]
    farthest = sub_df.Sentences.iloc[order[-1]]
    return target, nearest, farthest

In [108]:
# Seed prediction plus the sentences whose predictions are nearest to / farthest from it.
# NOTE(review): the execution counts of this cell and the inspection cells after it are out
# of order ([108] followed by [100]-[104]); re-run top to bottom to confirm the shown outputs.
target, near, far = abs_dif(panic,"film")

In [100]:
near

'This abstraction was filmed in Iraq.'

In [101]:
far

'This positive was filmed in Iraq.'

In [102]:
eval_pred_test(near)

-0.9771453142166138

In [103]:
eval_pred_test(far)

0.9987342953681946

In [104]:
target

-0.9768388867378235