# [pair.withgoogle.com/explorables/fill-in-the-blank](https://pair.withgoogle.com/explorables/fill-in-the-blank)

Use `Runtime -> Run all` to generate the plots in the "Appendix: Differences Over Time" section.

In addition to the difference between sentence 0 and sentence 1, the logits of the top tokens over time for sentence 0 and sentence 1 are also shown here. 

# Helpers

In [1]:
%%capture

import os
import torch
!pip install transformers
from transformers import (BertForMaskedLM, BertTokenizer)
import numpy as np
import pandas as pd
import IPython
from google.colab import output

In [2]:
%%capture

# Name of the pretrained checkpoint; the tokenizer and the masked-LM model are
# both loaded from it so that token ids line up with the model's vocabulary.
modelpath_bert_large = "bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(modelpath_bert_large)
model = BertForMaskedLM.from_pretrained(modelpath_bert_large)
# Inference only: switch the module to evaluation mode.
model.eval()

# Use the first GPU when one is available, otherwise stay on the CPU.
# `device` is also read by calcYearEmbeds to place its input batch.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [3]:
def calcYearEmbeds(sentence):
  """Return the model output at the [MASK] position of `sentence` for every year.

  The literal 'YEAR' in `sentence` is replaced by each year in
  range(minYear, maxYear) (maxYear excluded). All variants are encoded and run
  through `model` as a single batch.

  Reads the notebook globals `minYear`, `maxYear`, `tokenizer`, `model` and
  `device`; they must be defined before this is called.

  Args:
    sentence: template string containing 'YEAR' and a '[MASK]' token.

  Returns:
    numpy array holding `outputs[0]` sliced at the mask position along axis 1,
    i.e. one row per year.

  Raises:
    ValueError: if the first encoded sentence contains no mask token.
  """
  sentenceTokens = [
    tokenizer.encode(sentence.replace('YEAR', str(year)))
    for year in range(minYear, maxYear)
  ]

  # Assumes every year encodes to the same number of tokens; otherwise the
  # token lists cannot be stacked into one rectangular batch.
  inputs = torch.tensor(sentenceTokens).to(device)

  # Inference only: skip autograd bookkeeping so no graph is kept alive for
  # the whole batch.
  with torch.no_grad():
    outputs = model(inputs)
  embeds = outputs[0].cpu().detach().numpy()

  # Ask the tokenizer for the mask id instead of hard-coding it. The mask
  # position is taken from the first sentence and reused for every row.
  index_of_mask = sentenceTokens[0].index(tokenizer.mask_token_id)
  return np.take(embeds, index_of_mask, axis=1)

In [4]:
def calcTopTokens(e0, e1, vocab_size=None, top_n=150):
  """Build a tidy per-(year, token) frame restricted to the top-ranked tokens.

  `e0` and `e1` are flattened row-major, so flat position `i` maps to
  token `i % vocab_size` and year `floor(i / vocab_size)`.

  Args:
    e0: array of scores for sentence 0, one row per year and one column per
      vocabulary token (as returned by calcYearEmbeds).
    e1: array of scores for sentence 1, same shape as `e0`.
    vocab_size: number of tokens per year row. Defaults to the size of the
      last axis of `e0` (30522 for bert-large-uncased).
    top_n: a token is kept when its combined rank is strictly below this.

  Returns:
    DataFrame with columns 'index', 'e0', 'e1', 'dif', 'token_index' and
    'year_index' (float), containing only rows whose token was selected.
  """
  e0 = np.asarray(e0)
  e1 = np.asarray(e1)
  if vocab_size is None:
    # Take the vocabulary size from the data instead of hard-coding it.
    vocab_size = e0.shape[-1]

  # Merge e0 and e1 into a df;
  df = pd.DataFrame({'e0': e0.flatten(), 'e1': e1.flatten()})
  df['dif'] = df['e0'] - df['e1']

  # Calculate year and token_index based on the flat index
  df = df.reset_index()
  df['token_index'] = df['index'].mod(vocab_size)
  df['year_index'] = np.floor(df['index'].div(vocab_size))

  # Group by token_index
  # Sentences rank tokens separately so the less likely sentence will still include its outliers
  by_token = df.groupby('token_index')[['e0', 'e1']].mean()
  by_token['i0'] = by_token['e0'].rank(ascending=False)
  by_token['i1'] = by_token['e1'].rank(ascending=False)
  # Best (lowest) rank a token reaches in either sentence, then re-ranked.
  by_token['i_combined_min'] = by_token[['i0','i1']].min(axis=1).rank()

  top_tokens = by_token.loc[by_token['i_combined_min'] < top_n]

  return df.loc[df['token_index'].isin(top_tokens.index)]


In [11]:
# Variant of the page template that loads its stylesheet, d3 bundle and a
# watch-files.js script from roadtolarissa.com. Not referenced by the cells
# visible here; presumably kept for local development of the chart code.
# `{js_data}` is a str.format placeholder.
HTML_DEV_TEMPLATE = '''
  <link rel='stylesheet' href='https://roadtolarissa.com/colab/gender-over-time-colab/style.css'>
  <script src='https://roadtolarissa.com/worlds-group-2017/d3_.js'></script>
  <div id='graph'></div>

  <script>window.jsData = {js_data}</script>
  <script>window.timeoutMS = 250</script>
  <script src='https://roadtolarissa.com/colab/gender-over-time-colab/watch-files.js'></script>
'''

# Page template used to render the chart: loads the stylesheet, d3 bundle and
# script.js from the PAIR-code/ai-explorables repository via the jsDelivr CDN.
# `{js_data}` is a str.format placeholder whose value becomes window.jsData.
HTML_TEMPLATE = '''
  <link rel='stylesheet' href='https://cdn.jsdelivr.net/gh/PAIR-code/ai-explorables/server-side/fill-in-the-blank/gender-over-time-colab/style.css'>
  <script src='https://cdn.jsdelivr.net/gh/PAIR-code/ai-explorables/source/third_party/d3_.js'></script>
  <div id='graph'></div>

  <script>window.jsData = {js_data}</script>
  <script>window.timeoutMS = 250</script>
  <script src='https://cdn.jsdelivr.net/gh/PAIR-code/ai-explorables/server-side/fill-in-the-blank/gender-over-time-colab/script.js'></script>
'''

 # Edit s0 and s1 to see other differences over time


In [12]:
# The two sentence templates to compare. 'YEAR' is replaced by each year and
# [MASK] is the slot whose predictions are compared.
s0 = 'In YEAR, he was arrested for [MASK].'
s1 = 'In YEAR, she was arrested for [MASK].'

# Year range, read as globals by calcYearEmbeds via range(minYear, maxYear),
# so maxYear itself is excluded.
minYear = 1860 # min 1707, "1706" token not in BERT vocab.
maxYear = 2018 # max 2022, BERT was trained in 2018.

# Per-year model outputs at the [MASK] position for each sentence.
e0 = calcYearEmbeds(s0)
e1 = calcYearEmbeds(s1)

# Keep only the top-ranked tokens and serialize the columns the chart needs.
out_df = calcTopTokens(e0, e1)
tidyCSV = out_df[['e0', 'e1', 'token_index', 'year_index']].to_csv(index=False)
js_data = {'minYear': minYear, 'maxYear': maxYear, 's0': s0, 's1': s1, 'tidyCSV': tidyCSV}
# NOTE: str.format inserts the dict's Python repr into the <script> tag, so
# the page receives it as a JavaScript object literal rather than strict JSON.
IPython.display.display(IPython.display.HTML(HTML_TEMPLATE.format(js_data=js_data)))

# Save explorable data

In [7]:
# Sentence pairs exported for the explorable: [sentence 0, sentence 1].
# 'YEAR' is replaced by each year and [MASK] is the slot being predicted.
pairs = [
  ['In YEAR, he worked as a [MASK].', 'In YEAR, she worked as a [MASK].'],
  ['In YEAR, he studied [MASK].', 'In YEAR, she studied [MASK].'],
  ['Born in YEAR, his name was [MASK].', 'Born in YEAR, her name was [MASK].'],
  ['In YEAR, they worked as a [MASK].', 'In YEAR, she worked as a [MASK].'],
  ['In YEAR, he played a game of [MASK].', 'In YEAR, she played a game of [MASK].'],
  ['In YEAR, he and a bear [MASK].', 'In YEAR, she and a bear [MASK].'],
]

# The year range is identical for every pair, so set it once before the loop.
# calcYearEmbeds reads these two names as globals.
minYear = 1860 # min 1707, "1706" token not in BERT vocab.
maxYear = 2018 # max 2022, BERT was trained in 2018.

# One payload dict per sentence pair, in the same order as `pairs`.
out = []
for pair in pairs:
  s0, s1 = pair

  # Per-year model outputs at the [MASK] position for each sentence.
  e0 = calcYearEmbeds(s0)
  e1 = calcYearEmbeds(s1)

  # Keep only the top-ranked tokens and serialize the columns the chart needs.
  out_df = calcTopTokens(e0, e1)
  tidyCSV = out_df[['e0', 'e1', 'token_index', 'year_index']].to_csv(index=False)
  js_data = {'minYear': minYear, 'maxYear': maxYear, 's0': s0, 's1': s1, 'tidyCSV': tidyCSV}
  out.append(js_data)


In [8]:
from google.colab import files 
import json

# Serialize every pair's payload into a single JSON file in the working
# directory, then ask Colab to download that file.
with open('gender-over-time.json', 'w') as json_file:
  json.dump(out, json_file)
files.download('gender-over-time.json')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>