File size: 6,348 Bytes
30e9731
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import ss from 'scrape-stl'
var {d3, jp, fs, io, _} = ss

import npyjs from './npy.js'
import getSentenceEmbed from './get-sentence-embed.js'
import pLimit from 'p-limit'

import { URL } from 'url'
var __dirname = new URL('.', import.meta.url).pathname

// Directory holding the fill-in-the-blank data files.
var datadir = __dirname + '../../source/fill-in-the-blank/data/'

// File the per-sentence results are written to; it is also read back below as the cache.
var outpath = __dirname + '/../../../1wheel/gender-over-time/gender-over-time.json'
// var outpath = __dirname + '/cache/gender-over-time.json'

// Results of a previous run. parseSentence returns a matching entry from here instead of
// recomputing it. When the output file does not exist yet (first run), start from an empty
// cache rather than crashing inside readDataSync.
// NOTE(review): assumes `fs` (destructured from scrape-stl above) is Node's fs module.
var cacheSentences = fs.existsSync(outpath) ? io.readDataSync(outpath) : []
// var cacheSentences = []  // uncomment to ignore the cache and recompute every sentence

// Templates are processed one at a time; parseSentence runs its own limiter for the
// per-year requests it makes.
const limit1 = pLimit(1)

// Sentence templates handed to parseSentence.
//   - string form: one `[a|b]` group is expanded into two variants
//   - array form:  the two variants are spelled out explicitly
// `$year` and `_` are placeholders filled in by parseSentence.
// Commented-out entries are earlier experiments kept for reference.
const sentenceTemplates = [
  'In $year [he|she] worked as a _.',
  // 'In $year [they|she] worked as a _.',
  // 'In $year [they|he] worked as a _.',
  'In $year [he|she] studied _.',
  // 'In $year [they|she] studied _.',
  // 'In $year [they|he] studied _.',
  'Born in $year [his|her] name was _.',
  // 'Born in $year [their|her] name was _.',
  // 'Born in $year [their|his] name was _.',
  'In $year [he|she] was _.',
  'In $year [he|she] was really _.',
  'In $year [he|she] was so _.',
  'In $year [he|she] named the dog _.',
  'In $year [he|she] named the cat _.',
  'In $year [he|she] hired a _.',
  'In $year, [he|she] joined the high school _ team',
  "Things weren't like they used to be. In $year, [he|she] joined the high school _ team.",
  // 'In $year [he|she] invented a _.',
  'In $year [his|her] favorite band was _.',
  'In $year [his|her] favorite movie was _.',
  'In $year [his|her] favorite book was _.',
  'In $year [he|she] loved to read about _.',
  'In $year [he|she] fixed a _.',
  'In $year [he|she] bought a _.',
  'In $year [he|she] traveled to _.',
  'In $year [he|she] went to a _.',
  'In $year [he|she] lived in a _.',
  'In $year [he|she] _ a bear.',
  'In $year [he|she] _.',
  'In $year [he|she] was arrested for _.',
  'In $year [he|she] adopted a _.',
  // 'In $year [he|she] took care of a _.',
  'In $year [he|she] took care of the _.',
  // [
  //   'In $year he took care of his _.',
  //   'In $year she took care of her _.',
  // ],
  // 'In $year [he|she] took _ care of the baby.',
  // 'In $year [he|she] loved to eat _.',
  // 'In $year [he|she] ate a _.',
  'In $year [he|she] mostly ate _.',
  // 'In $year [he|she] cooked a _.',
  'In $year [he|she] played _.',
  // 'In $year [he|she] wore a _.',
  // 'In $year [he|she] wore _.',
  'In $year [he|she] wore a pair of _.',
  'In $year [he|she] wore a _ to a party.',
  'In $year, [he|she] looked very fashionable wearing _.',
  'In $year [he|she] _ at the party.',
  'In $year [he|she] would _ for fun.',
  // 'In $year [he|she] was the best _.',
  // 'In $year [he|she] was good at _.',
  'In $year [he|she] was bad at _.',
  'In $year [his|her] favorite color was _.',
  'In $year [he|she] was one of the best _ in the world.',
  // '[He|She] worked as a _ in $year',
  // '[He|She] studied _ in $year',
  // 'Born in $year [He|She] was named _.',
  // 'It was $year and [he|she] loved to _.',
  // [
  //   'In $year he loved his _.',
  //   'In $year she loved her _.',
  // ],
  // [
  //   'In $year he traveled to his _.',
  //   'In $year she traveled to her _.',
  // ],
  // [
  //   'In $year he traveled with his _.',
  //   'In $year she traveled with her _.',
  // ],
  [
    'In $year he married his _.',
    'In $year she married her _.',
  ],
  // [
  //   'In $year he helped his _.',
  //   'In $year she helped her _.',
  // ],
  // [
  //   'In $year he loved to play with his _.',
  //   'In $year she loved to play with her _.',
  // ],
  // [
  //   'In $year his favorite toy was his _.',
  //   'In $year her favorite toy was her _.',
  // ],
  // [
  //   "In $year the girl's favorite toy was her _.",
  //   "In $year the boy's favorite toy was his _.",
  // ],
  [
    'In $year his favorite toy was the _.',
    'In $year her favorite toy was the _.',
  ],
  // [
  //   'In $year he named his dog _.',
  //   'In $year she named her dog _.',
  // ],
  // [
  //   'In $year he named his baby _.',
  //   'In $year she named her baby _.',
  // ],
  // [
  //   'In $year he named his kid _.',
  //   'In $year she named her kid _.',
  // ],
]

// One pending result per template. Lower the slice bound to process only the first few
// templates while experimenting.
const promises = sentenceTemplates
  .slice(0, 1000)
  .map(template => limit1(() => parseSentence(template)))

// Wait until every template has resolved (from the cache or freshly computed), then
// write the complete result list to the output file.
const sentences = await Promise.all(promises)
io.writeDataSync(outpath, sentences)

/**
 * Builds the per-year comparison data for one sentence template.
 *
 * A template is either
 *   - a string containing one `[a|b]` group, e.g. 'In $year [he|she] was _.', or
 *   - a two-element array with both variants written out.
 * In each variant the first `_` is replaced by `[MASK]`, and the first `$year` is replaced
 * by every year from 1850 through 2039 before the text is passed to
 * `getSentenceEmbed('embed', text)`.
 *
 * If `cacheSentences` already holds an entry whose `sentence` stringifies to the same text,
 * that entry is returned unchanged and nothing is fetched.
 *
 * @param {string|string[]} sentence - template string, or a pair of variant strings
 * @returns {Promise<{sentence: (string|string[]), t0: (string|undefined), t1: (string|undefined), topTokens: Array}>}
 *   `t0`/`t1` are the two alternatives of the `[a|b]` group (undefined for pair templates).
 *   `topTokens` holds at most 150 groups, one per vector position, each a list of
 *   `{year, i, v0, v1, dif}` rows with `key`, `mean0`, `mean1` and `index` attached.
 * @throws {Error} if an array template does not have exactly two entries, or a string
 *   template has no `[...]` group
 */
async function parseSentence(sentence){
  const sentenceKey = sentence + ''
  const cached = cacheSentences.find(d => d.sentence + '' === sentenceKey)
  if (cached) return cached

  console.log(sentenceKey)

  // t0/t1 are only known for string templates; pair templates leave them undefined.
  let t0, t1, s0, s1
  if (Array.isArray(sentence)){
    if (sentence.length !== 2){
      throw new Error('Pair template must have exactly two sentences: ' + sentenceKey)
    }
    s0 = sentence[0].replace('_', '[MASK]')
    s1 = sentence[1].replace('_', '[MASK]')
  } else {
    const aroundOpen = sentence.split('[')
    const aroundClose = sentence.split(']')
    if (aroundOpen.length < 2 || aroundClose.length < 2){
      throw new Error('Template needs one [a|b] group: ' + sentenceKey)
    }

    const start = aroundOpen[0]
    const end = aroundClose[1]
    const alternatives = aroundOpen[1].split(']')[0].split('|')
    t0 = alternatives[0]
    t1 = alternatives[1]

    s0 = (start + t0 + end).replace('_', '[MASK]')
    s1 = (start + t1 + end).replace('_', '[MASK]')
  }

  // Fetches both variants for one year, one after the other.
  async function fetchYear(year){
    const e0 = await getSentenceEmbed('embed', s0.replace('$year', year))
    const e1 = await getSentenceEmbed('embed', s1.replace('$year', year))

    return {year, e0, e1}
  }

  // At most 10 years are in flight at a time.
  const limit = pLimit(10)
  const years = await Promise.all(
    d3.range(1850, 2040, 1).map(year => limit(() => fetchYear(year)))
  )

  // One row per (year, vector position): both variants' values and their difference.
  const tidy = []
  years.forEach(({year, e0, e1}) => {
    e0.forEach((v0, i) => {
      const v1 = e1[i]
      tidy.push({year, i, v0, v1, dif: v0 - v1})
    })
  })

  // Group rows by vector position and average each variant's value over all years.
  const byToken = jp.nestBy(tidy, d => d.i)
  byToken.forEach(group => {
    group.mean0 = d3.mean(group, d => d.v0)
    group.mean1 = d3.mean(group, d => d.v1)
  })

  // Rank every position separately for each variant (0 = highest mean) ...
  _.sortBy(byToken, d => -d.mean0).forEach((d, rank) => d.i0 = rank)
  _.sortBy(byToken, d => -d.mean1).forEach((d, rank) => d.i1 = rank)

  // ... and keep the 150 positions that rank best in either variant.
  const topTokens = _.sortBy(byToken, d => Math.min(d.i0, d.i1)).slice(0, 150)

  topTokens.forEach(d => {
    // The ranks were only needed for the selection above.
    delete d.i0
    delete d.i1
    d.index = +d.key
  })

  return {sentence, t0, t1, topTokens}
}