Daniel-Saeedi committed on
Commit 95b1b47
1 Parent(s): 2f7cc88

data and utils

data/definitional_pairs.json ADDED
@@ -0,0 +1 @@
1
+ [["woman", "man"], ["girl", "boy"], ["she", "he"], ["mother", "father"], ["daughter", "son"], ["gal", "guy"], ["female", "male"], ["her", "his"], ["herself", "himself"], ["Mary", "John"]]
data/equalize_pairs.json ADDED
@@ -0,0 +1 @@
1
+ [["monastery", "convent"], ["spokesman", "spokeswoman"], ["Catholic_priest", "nun"], ["Dad", "Mom"], ["Men", "Women"], ["councilman", "councilwoman"], ["grandpa", "grandma"], ["grandsons", "granddaughters"], ["prostate_cancer", "ovarian_cancer"], ["testosterone", "estrogen"], ["uncle", "aunt"], ["husbands", "wives"], ["Father", "Mother"], ["Grandpa", "Grandma"], ["He", "She"], ["boy", "girl"], ["boys", "girls"], ["brother", "sister"], ["brothers", "sisters"], ["businessman", "businesswoman"], ["chairman", "chairwoman"], ["colt", "filly"], ["congressman", "congresswoman"], ["dad", "mom"], ["dads", "moms"], ["dudes", "gals"], ["ex_boyfriend", "ex_girlfriend"], ["father", "mother"], ["fatherhood", "motherhood"], ["fathers", "mothers"], ["fella", "granny"], ["fraternity", "sorority"], ["gelding", "mare"], ["gentleman", "lady"], ["gentlemen", "ladies"], ["grandfather", "grandmother"], ["grandson", "granddaughter"], ["he", "she"], ["himself", "herself"], ["his", "her"], ["king", "queen"], ["kings", "queens"], ["male", "female"], ["males", "females"], ["man", "woman"], ["men", "women"], ["nephew", "niece"], ["prince", "princess"], ["schoolboy", "schoolgirl"], ["son", "daughter"], ["sons", "daughters"], ["twin_brother", "twin_sister"]]
data/female_word_file.txt ADDED
@@ -0,0 +1,221 @@
1
+ countrywoman
2
+ sororal
3
+ witches
4
+ maidservant
5
+ mothers
6
+ diva
7
+ actress
8
+ spinster
9
+ mama
10
+ duchesses
11
+ barwoman
12
+ countrywomen
13
+ dowry
14
+ hostesses
15
+ suitors
16
+ airwomen
17
+ menopause
18
+ clitoris
19
+ princess
20
+ governesses
21
+ abbess
22
+ women
23
+ widow
24
+ ladies
25
+ sorceresses
26
+ madam
27
+ brides
28
+ baroness
29
+ housewives
30
+ godesses
31
+ niece
32
+ widows
33
+ lady
34
+ sister
35
+ brides
36
+ nun
37
+ adultresses
38
+ obstetrics
39
+ bellgirls
40
+ her
41
+ marchioness
42
+ princesses
43
+ empresses
44
+ mare
45
+ chairwoman
46
+ convent
47
+ priestesses
48
+ girlhood
49
+ ladies
50
+ queen
51
+ gals
52
+ mommies
53
+ maid
54
+ female_ejaculation
55
+ spokeswoman
56
+ seamstress
57
+ cowgirls
58
+ chick
59
+ spinsters
60
+ hair_salon
61
+ empress
62
+ mommy
63
+ feminism
64
+ gals
65
+ enchantress
66
+ gal
67
+ motherhood
68
+ estrogen
69
+ camerawomen
70
+ godmother
71
+ strongwoman
72
+ goddess
73
+ matriarch
74
+ aunt
75
+ chairwomen
76
+ ma'am
77
+ sisterhood
78
+ hostess
79
+ estradiol
80
+ wife
81
+ mom
82
+ stewardess
83
+ females
84
+ viagra
85
+ spokeswomen
86
+ ma
87
+ belle
88
+ minx
89
+ maiden
90
+ witch
91
+ miss
92
+ nieces
93
+ mothered
94
+ cow
95
+ belles
96
+ councilwomen
97
+ landlords
98
+ granddaughter
99
+ fiancees
100
+ stepmothers
101
+ horsemen
102
+ grandmothers
103
+ adultress
104
+ schoolgirl
105
+ hen
106
+ granddaughters
107
+ bachelorette
108
+ camerawoman
109
+ moms
110
+ her
111
+ mistress
112
+ lass
113
+ policewoman
114
+ nun
115
+ actresses
116
+ saleswomen
117
+ girlfriend
118
+ councilwoman
119
+ lady
120
+ stateswoman
121
+ maternal
122
+ lass
123
+ landlady
124
+ sistren
125
+ ladies
126
+ wenches
127
+ sorority
128
+ bellgirl
129
+ duchess
130
+ ballerina
131
+ chicks
132
+ fiancee
133
+ fillies
134
+ wives
135
+ suitress
136
+ paternity
137
+ she
138
+ businesswoman
139
+ masseuses
140
+ heroine
141
+ doe
142
+ busgirls
143
+ girlfriends
144
+ queens
145
+ sisters
146
+ mistresses
147
+ stepmother
148
+ daughter
149
+ minxes
150
+ cowgirl
151
+ lady
152
+ daughters
153
+ mezzo
154
+ saleswoman
155
+ mistress
156
+ hostess
157
+ nuns
158
+ maids
159
+ mrs.
160
+ headmistresses
161
+ lasses
162
+ congresswoman
163
+ airwoman
164
+ housewife
165
+ priestess
166
+ barwomen
167
+ barnoesses
168
+ abbesses
169
+ handywoman
170
+ toque
171
+ sororities
172
+ stewardesses
173
+ filly
174
+ czarina
175
+ stepdaughters
176
+ herself
177
+ girls
178
+ lionesses
179
+ lady
180
+ vagina
181
+ hers
182
+ masseuse
183
+ cows
184
+ aunts
185
+ wench
186
+ toques
187
+ wife
188
+ lioness
189
+ sorceress
190
+ effeminate
191
+ mother
192
+ lesbians
193
+ female
194
+ waitresses
195
+ ovum
196
+ skene_gland
197
+ stepdaughter
198
+ womb
199
+ businesswomen
200
+ heiress
201
+ waitress
202
+ headmistress
203
+ woman
204
+ governess
205
+ godess
206
+ bride
207
+ grandma
208
+ bride
209
+ gal
210
+ lesbian
211
+ ladies
212
+ girl
213
+ grandmother
214
+ mare
215
+ hens
216
+ uterus
217
+ nuns
218
+ maidservants
219
+ seamstress'
220
+ busgirl
221
+ heroines
data/gender_specific_full.json ADDED
@@ -0,0 +1 @@
1
+ ["he", "his", "He", "her", "she", "him", "She", "man", "women", "men", "His", "woman", "spokesman", "wife", "himself", "son", "mother", "father", "chairman", "daughter", "husband", "guy", "girls", "girl", "Her", "boy", "King", "boys", "brother", "Chairman", "spokeswoman", "female", "sister", "Women", "Man", "male", "herself", "Lions", "Lady", "brothers", "dad", "actress", "mom", "sons", "girlfriend", "Kings", "Men", "daughters", "Prince", "Queen", "teenager", "lady", "Bulls", "boyfriend", "sisters", "Colts", "mothers", "Sir", "king", "businessman", "Boys", "grandmother", "grandfather", "deer", "cousin", "Woman", "ladies", "Girls", "Father", "uncle", "PA", "Boy", "Councilman", "mum", "Brothers", "MA", "males", "Girl", "Mom", "Guy", "Queens", "congressman", "Dad", "Mother", "grandson", "twins", "bull", "queen", "businessmen", "wives", "widow", "nephew", "bride", "females", "aunt", "Congressman", "prostate_cancer", "lesbian", "chairwoman", "fathers", "Son", "moms", "Ladies", "maiden", "granddaughter", "younger_brother", "Princess", "Guys", "lads", "Ma", "Sons", "lion", "Bachelor", "gentleman", "fraternity", "bachelor", "niece", "Lion", "Sister", "bulls", "husbands", "prince", "colt", "salesman", "Bull", "Sisters", "hers", "dude", "Spokesman", "beard", "filly", "Actress", "Him", "princess", "Brother", "lesbians", "councilman", "actresses", "Viagra", "gentlemen", "stepfather", "Deer", "monks", "Beard", "Uncle", "ex_girlfriend", "lad", "sperm", "Daddy", "testosterone", "MAN", "Female", "nephews", "maid", "daddy", "mare", "fiance", "Wife", "fiancee", "kings", "dads", "waitress", "Male", "maternal", "heroine", "feminist", "Mama", "nieces", "girlfriends", "Councilwoman", "sir", "stud", "Mothers", "mistress", "lions", "estranged_wife", "womb", "Brotherhood", "Statesman", "grandma", "maternity", "estrogen", "ex_boyfriend", "widows", "gelding", "diva", "teenage_girls", "nuns", "Daughter", "czar", "ovarian_cancer", "HE", "Monk", "countrymen", "Grandma", "teenage_girl", "penis", "bloke", "nun", "Husband", "brides", "housewife", "spokesmen", "suitors", "menopause", "monastery", "patriarch", "Beau", "motherhood", "brethren", "stepmother", "Dude", "prostate", "Moms", "hostess", "twin_brother", "Colt", "schoolboy", "eldest", "brotherhood", "Godfather", "fillies", "stepson", "congresswoman", "Chairwoman", "Daughters", "uncles", "witch", "Mommy", "monk", "viagra", "paternity", "suitor", "chick", "Pa", "fianc\u00e9", "sorority", "macho", "Spokeswoman", "businesswoman", "eldest_son", "gal", "statesman", "schoolgirl", "fathered", "goddess", "hubby", "mares", "stepdaughter", "blokes", "dudes", "socialite", "strongman", "Witch", "fianc\u00e9e", "uterus", "grandsons", "Bride", "studs", "mama", "Aunt", "godfather", "hens", "hen", "mommy", "Babe", "estranged_husband", "Fathers", "elder_brother", "boyhood", "baritone", "Diva", "Lesbian", "grandmothers", "grandpa", "boyfriends", "feminism", "countryman", "stallion", "heiress", "queens", "Grandpa", "witches", "aunts", "semen", "fella", "granddaughters", "chap", "knight", "widower", "Maiden", "salesmen", "convent", "KING", "vagina", "beau", "babe", "HIS", "beards", "handyman", "twin_sister", "maids", "gals", "housewives", "Gentlemen", "horsemen", "Businessman", "obstetrics", "fatherhood", "beauty_queen", "councilwoman", "princes", "matriarch", "colts", "manly", "ma", "fraternities", "Spokesmen", "pa", "fellas", "Gentleman", "councilmen", "dowry", "barbershop", "Monks", "WOMAN", "fraternal", "ballerina", "manhood", "Dads", "heroines", "granny", "gynecologist", 
"princesses", "Goddess", "yo", "Granny", "knights", "eldest_daughter", "HER", "underage_girls", "masculinity", "Girlfriend", "bro", "Grandmother", "grandfathers", "crown_prince", "Restless", "paternal", "Queen_Mother", "Boyfriend", "womens", "Males", "SHE", "Countess", "stepchildren", "Belles", "bachelors", "matron", "momma", "Legs", "maidens", "goddesses", "landlady", "sisterhood", "Grandfather", "Fraternity", "Majesty", "Babes", "lass", "maternal_grandmother", "blondes", "ma'am", "Womens", "divorcee", "Momma", "fathering", "Effie", "Lad", "womanhood", "missus", "Sisterhood", "granddad", "Mens", "papa", "gf", "sis", "Husbands", "Hen", "womanizer", "gynecological", "stepsister", "Handsome", "Prince_Charming", "BOY", "stepdad", "teen_ager", "GIRL", "dame", "Sorority", "beauty_pageants", "raspy", "harem", "maternal_grandfather", "Hes", "deliveryman", "septuagenarian", "damsel", "paternal_grandmother", "paramour", "paternal_grandparents", "Nun", "DAD", "mothering", "shes", "HE_'S", "Nuns", "teenage_daughters", "auntie", "widowed_mother", "Girlfriends", "FATHER", "virile", "COUPLE", "grandmas", "Hubby", "nan", "vixen", "Joan_Crawford", "stepdaughters", "endometrial_cancer", "stepsons", "loins", "Grandson", "Mitchells", "erections", "Matron", "Fella", "daddies", "ter", "Sweetie", "Dudes", "Princesses", "Lads", "lioness", "Mamma", "virility", "bros", "womenfolk", "Heir", "BROTHERS", "manliness", "patriarchs", "earl", "sisterly", "Whore", "Gynaecology", "countess", "convents", "Oratory", "witch_doctor", "mamas", "yah", "aunty", "aunties", "Heiress", "lasses", "Breasts", "fairer_sex", "sorority_sisters", "WIFE", "Laurels", "penile", "nuh", "mah", "toms", "mam", "Granddad", "premenopausal_women", "Granddaddy", "nana", "coeds", "dames", "herdsman", "Mammy", "Fellas", "Niece", "menfolk", "Grandad", "bloods", "Gramps", "damsels", "Granddaughter", "mamma", "concubine", "Oros", "Blarney", "filial", "broads", "Ethel_Kennedy", "ACTRESS", "Tit", "fianc", "Hunk", "Night_Shift", "wifey", "Lothario", "Holy_Roman_Emperor", "horse_breeder", "grandnephew", "Lewises", "Muscular", "feminist_movement", "Sanan", "women\u00e2_\u20ac_\u2122", "Fiancee", "dowries", "Carmelite", "rah", "n_roller", "bay_filly", "belles", "Uncles", "PRINCESS", "womans", "Homeboy", "Blokes", "Charmer", "codger", "Delta_Zeta", "courtesans", "grandaughter", "SISTER", "Highness", "grandbabies", "crone", "Skip_Away", "noblewoman", "bf", "jane", "philandering_husband", "Sisqo", "mammy", "daugher", "director_Skip_Bertman", "DAUGHTER", "Royal_Highness", "mannish", "spinsters", "Missus", "madame", "Godfathers", "saleswomen", "beaus", "Risha", "luh", "sah", "negligee", "Women\u00e2_\u20ac_\u2122", "Hos", "salesgirl", "grandmom", "Grandmas", "Lawsons", "countrywomen", "Booby", "darlin", "Sheiks", "boyz", "wifes", "Bayi", "Il_Duce", "\u00e2_\u20ac_\u0153My", "fem", "daugther", "Potti", "hussy", "tch", "Gelding", "stemmed_roses", "Damson", "puh", "Tylers", "neice", "Mutha", "GRANDMOTHER", "youse", "spurned_lover", "mae", "Britt_Ekland", "clotheshorse", "Carlita_Kilpatrick", "Cambest", "Pretty_Polly", "banshees", "male_chauvinist", "Arliss", "mommas", "maidservant", "Gale_Harold", "Little_Bo_Peep", "Cleavers", "hags", "blowsy", "Queen_Elizabeth_I.", "lassies", "papas", "BABE", "ugly_ducklings", "Jims", "hellion", "Beautician", "coalminer", "relaxin", "El_Mahroug", "Victoria_Secret_Angel", "shepherdess", "Mosco", "Slacks", "nanna", "wifely", "tomboys", "LAH", "hast", "apo", "Kaplans", "milkmaid", "Robin_Munis", "John_Barleycorn", "royal_highness", 
"Meanie", "NAH", "trollop", "roh", "Jewess", "Sheik_Hamad", "mumsy", "Big_Pussy", "chil_dren", "Aunt_Bea", "basso", "sista", "girlies", "nun_Sister", "chica", "Bubbas", "massa", "Southern_belles", "Nephews", "castrations", "Mister_Ed", "Grandsons", "Calaf", "Malachy_McCourt", "Shamash", "hey_hey", "Harmen", "sonofabitch", "Donovans", "Grannie", "Kalinka", "hisself", "Devean", "goatherd", "hinds", "El_Corredor", "Kens", "notorious_womanizer", "goh", "Mommas", "washerwoman", "Samaira", "Coo_Coo", "Governess", "grandsire", "PRINCE_WILLIAM", "gramma", "him.He", "Coptic_priest", "Corbie", "Kennys", "thathe", "Pa_Pa", "Bristols", "Hotep", "snowy_haired", "El_Prado_Ire", "Girl_hitmaker", "Hurleys", "St._Meinrad", "sexually_perverted", "authoress", "Prudie", "raven_haired_beauty", "Bonos", "domestic_shorthair", "brothas", "nymphet", "Neelma", "Seita", "stud_muffin", "St._Judes", "yenta", "bare_shouldered", "Pinkney_Sr.", "PRINCE_CHARLES", "Bisutti", "sistas", "Blanche_Devereaux", "Momoa", "Quiff", "Scotswoman", "balaclava_clad_men", "Louis_Leakey", "dearie", "vacuum_cleaner_salesman", "grandads", "postulant", "SARAH_JESSICA_PARKER", "AUNT", "Prince_Dauntless", "Dalys", "Darkie", "Czar_Nicholas", "Lion_Hearted", "Boy_recliner", "baby_mamas", "giantess", "Lawd", "GRANNY", "fianc_e", "Bilqis", "WCTU", "famly", "Ellas", "feminazis", "Pentheus", "MAMAS", "Town_Criers", "Saggy", "youngman", "grandam", "divorc\u00e9", "bosomed", "roon", "Simmentals", "eponymous_heroine", "LEYLAND", "REE'", "cain't", "Evelynn", "WAH'", "sistah", "Horners", "Elsie_Poncher", "Coochie", "rat_terriers", "Limousins", "Buchinski", "Schicchi", "Carpitcher", "Khwezi", "HAH'", "Shazza", "Mackeson", "ROH'", "kuya", "novice_nun", "Shei", "Elmasri", "ladykiller", "6yo", "Yenta", "SHEL", "pater", "Souse", "Tahirah", "comedian_Rodney_Dangerfield", "Shottle", "carryin", "Sath", "fa'afafine", "royal_consort", "hus_band", "maternal_uncles", "dressing_provocatively", "dreamgirl", "millionaire_industrialist", "Georgie_Girl", "Must_Be_Obeyed", "joh", "Arabian_stallion", "ahr", "mso_para_margin_0in", "SOO'", "Biddles", "Chincoteague_Volunteer_Fire", "Lisa_Miceli", "gorgeous_brunette", "fianc\u017d", "Moved_fluently", "Afternoon_Deelites", "biker_dude", "Vito_Spatafore", "MICK_JAGGER", "Adesida", "Reineman", "witz", "Djamila", "Glenroe", "daddys", "Romanzi", "gentlewomen", "Dandie_Dinmont_terrier", "Excess_Ire", "By_SYVJ_Staff", "zan", "CONFESSIONS", "Magees", "wimmin", "tash", "Theatrical_Ire", "Prince_Charmings", "chocolate_eclair", "bron", "daughers", "Felly", "fiftyish", "Spritely", "GRANDPA", "distaffer", "Norbertines", "DAH'", "leader_Muammar_Gadaffi", "swains", "Prince_Tomohito", "Honneur", "Soeur", "jouster", "Pharaoh_Amenhotep_III", "QUEEN_ELIZABETH_II", "Ne'er", "Galileo_Ire", "Fools_Crow", "Lannisters", "Devines", "gonzales", "columnist_Ann_Landers", "Moseleys", "hiz", "busch", "roastee", "toyboys", "Sheffields", "grandaunt", "Galvins", "Giongo", "geh", "flame_haired_actress", "Grammarian", "Greg_Evigan", "frontierswoman", "Debele", "rabs", "nymphets", "aai", "BREE", "Shaqs", "ZAY", "pappa", "Housa", "refrigerator_repairman", "artificial_inseminations", "chickie", "Rippa", "teenager_Tracy_Turnblad", "homebred_colt", "Abigaille", "hen_pecked_husband", "businesman", "her.She", "Kaikeyi", "Stittsworth", "self_proclaimed_redneck", "Khella", "NeW", "Evers_Swindell", "Asmerom_Gebreselassie", "Boy_recliners", "Cliff_Claven", "Legge_Bourke", "Costos", "d'_honneur", "sistahs", "Cabble", "sahn", "CROW_AGENCY_Mont", "jezebel", "Harrolds", 
"ROSARIO_DAWSON", "INXS_frontman_Michael_Hutchence", "Gursikh", "Dadas", "VIAGA", "keen_horsewoman", "Theodoric", "Eldery", "lihn", "Alice_Kramden", "Santarina", "radical_cleric_al_Sadr", "Curleys", "SY'", "Fidaa", "Saptapadi", "Actor_Sean_Astin", "Kellita_Smith", "Doly", "Libertina", "Money_McBags", "Chief_Bearhart", "choirgirl", "chestnut_stallion", "VIGRA", "BY_JIM_McCONNELL", "Sal_Vitale", "Trivia_buffs", "kumaris", "fraternal_lodge", "galpals", "Borino_Quinn", "lina", "LATEST_Rapper", "Bezar", "Manro", "bakla", "Grisetti", "blond_bimbo", "spinster_aunt", "gurls", "hiswife", "paleface", "Charlye", "hippie_chicks", "Khalifas", "Picture_JUSTIN_SANSON", "Hepburns", "yez", "ALDER", "Sanussi", "Lil_Sis", "McLoughlins", "Barbra_Jean", "Lulua", "thatshe", "actress_Shohreh_Aghdashloo", "SIR_ANTHONY_HOPKINS", "Gloddy", "ZAH'", "ORANGE_'S", "Danielle_Bimber", "grandmum", "Kulkis", "Brazington", "Marisa_Lenhard_CFA", "SIR_JOHN", "Clareman", "Aqila", "Heavily_tattooed", "Libbys", "thim", "elocutionist", "submissives", "Inja", "rahm", "Agnes_Gooch", "fake_tits", "nancy_boys", "Swaidan", "SHAH'", "ain'ta_bed", "Shumail_Raj", "Duchesse", "diethylstilbestrol_DES", "colt_foal", "unfaithful_lover", "Maseri", "nevah", "SAHN", "Barths", "Toughkenamon", "GUEST_STARS", "him.But", "Donna_Claspell", "gingham_dresses", "Massage_Parlour", "wae", "Wasacz", "Magistra", "vihl", "Smriti_Iraani", "boyish_haircut", "workingwoman", "borthers", "Capuchin_friars", "Nejma", "yes_sirs", "bivocational_pastor", "Grafters", "HOPWOOD", "Nicknamed_Godzilla", "yos", "Berkenfield", "Missis", "sitcom_Designing_Women", "Kafoa", "trainer_Emma_Lavelle", "sadomasochistic_dungeon", "iht", "desperates", "predessor", "wolf_cub", "indigenous_Peruvians", "Livia_Soprano", "troh", "colt_sired", "BOND_HILL", "ihl", "Drydens", "rahs", "Piserchia", "Sonny_Corinthos", "bankrobber", "Fwank", "feisty_redhead", "booze_guzzling", "COOPERS", "actress_Q'orianka_Kilcher", "Cortezar", "twe", "Jacoub", "Cindy_Iannarelli", "Hell_Raiser", "Fondly_referred", "Bridal_Shoppe", "Noleta", "Christinas", "IAGRA", "LaTanya_Richardson", "Sang_Bender", "Assasins", "sorrel_gelding", "septugenarian", "Hissy", "Muqtada_al_Sadr_mook", "Pfeni", "MADRID_AFX_Banco_Santander", "tuchis", "LeVaughn", "Gadzicki", "transvestite_hooker", "Fame_jockey_Laffit", "nun_Sister_Mary", "SAMSONOV", "Mayflower_Madam", "Shaque", "well.He", "Trainer_Julio_Canani", "sorrel_mare", "minivehicle_joint_venture", "wife_Dwina", "Aasiya_AH'_see", "Baratheon", "Rick_O'Shay", "Mammies", "goatie", "Nell_Gwynne", "charmingly_awkward", "Slamma", "DEHL", "Lorenzo_Borghese", "ALMA_Wis.", "Anne_Scurria", "father_Peruvians_alternately", "JULIE_ANDREWS", "Slim_Pickins", "Victoria_Secret_stunner", "BY'", "Sanam_Devdas", "pronounced_luh", "Pasha_Selim", "\u4e2d\u534e", "rson", "maternal_grandmothers", "IOWA_CITY_Ia", "Madame_de_Tourvel", "JAY'", "Sheika_Mozah_bint_Nasser", "Hotsy_Totsy", "D'_Ginto", "singer_Johnny_Paycheck", "uterine_prolapse_surgery", "SCOTTDALE_Pa.", "AdelaideNow_reports", "Marcus_Schenkenberg", "Clyse", "Obiter_Dicta", "comic_Sam_Kinison", "bitties", "ROCKVILLE_Ind.", "swimsuit_calendars", "Decicio_Smith", "Ma_ma", "Rie_Miyazawa", "celibate_chastity", "gwah", "ZAY'", "HER_Majesty", "Defrere", "Las_Madrinas", "\u7c3f_\u8042_\u7ffb", "Bea_Hamill", "ARCADIA_Calif._Trainer", "Bold_Badgett", "stakes_victress", "Hoppin_Frog", "Narumiya", "Flayfil", "hardman_Vinnie_Jones", "Marilyn_Monroe_lookalike", "Kivanc_Tatlitug", "Persis_Khambatta", "SINKING_SPRING_Pa.", "len_3rd", "DEAR_TRYING", 
"Farndon_Cheshire", "Krishna_Madiga", "daughter_Princess_Chulabhorn", "Marshall_Rooster_Cogburn", "Kitty_Kiernan", "Yokich", "Jarou", "Serdaris", "ee_ay", "Montifiore", "Chuderewicz", "Samuel_Le_Bihan", "filly_Proud_Spell", "Umm_Hiba", "pronounced_koo", "Sandy_Fonzo", "KOR'", "Fielder_Civil_kisses", "Federalsburg_Maryland", "Nikah_ceremony", "Brinke_Stevens", "Yakama_Tribal_Council", "Capuchin_Father", "wife_Callista_Bisek", "Beau_Dare", "Bedoni", "Arjun_Punj", "JOHNNY_KNOXVILLE", "cap_tain", "Alderwood_Boys", "Chi_Eta_Phi", "ringleader_Charles_Graner", "Savoies", "Lalla_Salma", "Mrs._Potiphar", "fahn", "name_Taylor_Sumers", "Vernita_Green", "Bollywood_baddie", "BENBROOK_Texas", "Assemblyman_Lou_Papan", "virgin_brides", "Cho_Eun", "CATHY_Freeman", "Uncle_Saul", "Lao_Brewery", "Ibo_tribe", "ruf", "rival_Edurne_Pasaban", "Hei_Shangri_La", "Mommy_dearest", "interest_Angola_Sonogal", "Ger_Monsun", "PUSSYCAT_DOLL", "Crown_Jewels_Condoms", "Lord_Marke", "Patootie", "Nora_Bey", "huntin_shootin", "Minister_Raymond_Tshibanda", "La_Nina_la_NEEN", "signature_Whoppers", "estranged_hubby_Kevin_Federline", "UR'", "pill_poppin", "GEHR'", "purebred_Arabians", "husbandly_duties", "VIAGRA_TIMING", "Hereford_heifer", "hushed_monotone_voice", "Pola_Uddin", "Wee_Jimmy_Krankie", "Kwakwanso", "Our_Galvinator", "shoh", "Codependency_Anonymous_Group", "LA'", "Taufa'ahau", "Invincible_Spirit_colt", "SAH'_dur", "MOUNT_CARMEL_Pa.", "watches_attentively", "SNL_spinoffs", "Seth_Nitschke", "Duns_Berwickshire", "defendant_Colleen_LaRose", "Silky_O'Sullivan", "Highcliff_Farm", "REN'", "Comestar", "Satisfied_Frog", "Jai_Maharashtra", "ATTICA_Ind.", "lover_Larry_Birkhead", "Tami_Megal", "chauvinist_pigs", "Phi_sorority", "Micronesian_immigrant", "Lia_Boldt", "Sugar_Tits", "actress_Kathy_Najimy", "zhoo", "Colombo_underboss", "Katsav_accusers", "Bess_Houdini", "rap_mogul_Diddy", "companions_Khin_Khin", "Van_Het", "Mastoi_tribe", "VITALY", "ROLLING_STONES_rocker", "womanizing_cad", "LILY_COLE", "paternal_grandfathers", "Lt._Col._Kurt_Kosmatka", "Kasseem_Jr.", "Ji_Ji", "Wilburforce", "VIAGRA_DOSE", "English_Sheepdogs", "pronounced_Kah", "Htet_Htet_Oo", "Brisk_Breeze", "Eau_du", "BY_MELANIE_EVANS", "Neovasc_Medical", "British_funnyman_RICKY", "4YO_mare", "Hemaida", "MONKTON", "Mrs_Mujuru", "BaGhana_BaGhana", "Shaaban_Abdel_Rahim", "Edward_Jazlowiecki_lawyer", "Ajman_Stud", "manly_pharaoh_even", "Serra_Madeira_Islands", "FRAY'", "panto_dames", "Khin_Myo", "dancer_Karima_El_Mahroug", "CROWN_Princess", "Baseball_HOFer", "Hasta_la_Pasta", "GIRLS_NEXT_DOOR", "Benedict_Groeschel", "Bousamra", "Ruby_Rubacuori_Ruby", "Monde_Bleu", "Un_homme_qui", "Taylor_Sumers", "Rapper_EMINEM", "Joe_Menchetti", "VAY'", "supermodel_NAOMI_CAMPBELL", "Supermodel_GISELE_BUNDCHEN", "Au_Lait", "Radar_Installed", "THOMAS_TOWNSHIP_Mich.", "Rafinesque", "Herman_Weinrich", "Abraxas_Antelope", "raspy_voiced_rocker", "Manurewa_Cosmopolitan_Club", "Paraone", "THE_LEOPARD", "Boy_Incorporated_LZB", "Dansili_filly", "Lumpy_Rutherford", "unwedded_bliss", "Bhavna_Sharma", "Scarvagh", "en_flagrante", "Mottu_Maid", "Dowager_Queen", "NEEN", "model_Monika_Zsibrita", "ROSIE_PEREZ", "Mattock_Ranger", "Valorous", "Surpreme", "Marwari_businessmen", "Grandparents_aunts", "Kimberley_Vlaeminck", "Lyn_Treece_Boys", "PDX_Update", "Virsa_Punjab", "eyelash_fluttering", "Pi_fraternity", "HUNTLEIGH_Mo.", "novelist_Jilly_Cooper", "Naha_Shuri_temple", "Yasmine_Al_Massri", "Mu_Gamma_Xi", "Mica_Ertegun", "Ocleppo", "VIAGRA_CONTRAINDICATIONS", "daughter_PEACHES", "trainer_Geoff_Wragg", 
"OVERNIGHT_DELIVERY", "Fitts_retiree", "de_Tourvel", "Lil_Lad", "north_easterner", "Aol_Weird_News", "Somewhat_improbably", "Sikh_panth", "Worcester_2m_7f", "Zainab_Jah", "OLYMPIC_medalist", "Enoch_Petrucelly", "collie_Lassie", "LOW'", "clumsiness_Holloway", "ayr", "OHR'", "ROLLING_STONES_guitarist", "LAH'_nee", "Ian_Beefy_Botham", "Awapuni_trainer", "Glamorous_Granny", "Chiang_Ching", "MidAtlantic_Cardiovascular_Associates", "Yeke", "Seaforth_Huron_Expositor", "Westley_Cary_Elwes", "Cate_Blanchett_Veronica_Guerin", "Bellas_Gate", "witch_Glinda", "wives_mistresses", "Woodsville_Walmart", "2YO_colt", "Manav_Sushant_Singh", "Pupi_Avati_Il", "Sigma_Beta_Rho", "Bishop_Christopher_Senyonjo", "Vodou_priest", "Rubel_Chowdhury", "Claddagh_Ring", "TAH'_duh_al", "al_Sadr_mook_TAH'", "ROBIN_GIBB", "GAHN'", "BY_THOMAS_RANSON", "sister_Carine_Jena", "Lyphard_mare", "summa_cum", "Semenya_grandmother_Maputhi", "Clare_Nuns", "Talac", "sex_hormones_androgens", "majeste", "Saint_Ballado_mare", "Carrie_Huchel", "Mae_Dok", "wife_Dieula", "Earnest_Sirls", "spoof_bar_mitzvah", "von_Boetticher", "Audwin_Mosby", "Case_presentationWe", "Vincent_Papandrea", "KRAY'", "Sergi_Benavent", "Le_Poisson", "Von_Cramm", "Patti_Mell", "Raymi_Coya", "Benjamin_BeBe_Winans", "Nana_Akosua", "Auld_Acquaintance", "Desire_Burunga", "Company_Wrangler_Nestea", "ask_Krisy_Plourde", "JUANITA_BYNUM", "livia", "GAMB", "Gail_Rosario_Dawson", "Ramgarhia_Sikh", "Catholic_nun_Sister", "FOUR_WEDDINGS_AND", "Robyn_Scherer", "brother_King_Athelstan", "Santo_Loquasto_Fences", "Wee_Frees", "MARISOL", "Soliloquy_Stakes", "Whatever_Spoetzl", "Marc'Aurelio", "mon_petit", "Sabbar_al_Mashhadani", "KAY'_lee", "m_zah_MAH'", "BY_TAMI_ALTHOFF", "hobbit_Samwise_Gamgee", "Bahiya_Hariri_sister", "daddy_Larry_Birkhead", "Sow_Tracey_Ullman", "coach_Viljo_Nousiainen", "Carmen_Lebbos", "conjoined_twins_Zainab", "Rob_Komosa", "ample_bosomed", "Ageing_rocker", "psychic_Oda"]
data/male_word_file.txt ADDED
@@ -0,0 +1,221 @@
1
+ countryman
2
+ fraternal
3
+ wizards
4
+ manservant
5
+ fathers
6
+ divo
7
+ actor
8
+ bachelor
9
+ papa
10
+ dukes
11
+ barman
12
+ countrymen
13
+ brideprice
14
+ hosts
15
+ potential_suitors
16
+ airmen
17
+ andropause
18
+ penis
19
+ prince
20
+ governors
21
+ abbot
22
+ men
23
+ widower
24
+ gentlemen
25
+ sorcerers
26
+ sir
27
+ bridegrooms
28
+ baron
29
+ househusbands
30
+ gods
31
+ nephew
32
+ widowers
33
+ lord
34
+ brother
35
+ grooms
36
+ priest
37
+ adultors
38
+ andrology
39
+ bellboys
40
+ his
41
+ marquis
42
+ princes
43
+ emperors
44
+ stallion
45
+ chairman
46
+ monastery
47
+ priests
48
+ boyhood
49
+ fellas
50
+ king
51
+ dudes
52
+ daddies
53
+ manservant
54
+ semen
55
+ spokesman
56
+ tailor
57
+ cowboys
58
+ dude
59
+ bachelors
60
+ barbershop
61
+ emperor
62
+ daddy
63
+ masculism
64
+ guys
65
+ enchanter
66
+ guy
67
+ fatherhood
68
+ androgen
69
+ cameramen
70
+ godfather
71
+ strongman
72
+ god
73
+ patriarch
74
+ uncle
75
+ chairmen
76
+ sir
77
+ brotherhood
78
+ host
79
+ testosterone
80
+ husband
81
+ dad
82
+ steward
83
+ males
84
+ cialis
85
+ spokesmen
86
+ pa
87
+ beau
88
+ stud
89
+ bachelor
90
+ wizard
91
+ sir
92
+ nephews
93
+ fathered
94
+ bull
95
+ beaus
96
+ councilmen
97
+ landladies
98
+ grandson
99
+ fiances
100
+ stepfathers
101
+ horsewomen
102
+ grandfathers
103
+ adultor
104
+ schoolboy
105
+ rooster
106
+ grandsons
107
+ bachelor
108
+ cameraman
109
+ dads
110
+ him
111
+ master
112
+ lad
113
+ policeman
114
+ monk
115
+ actors
116
+ salesmen
117
+ boyfriend
118
+ councilman
119
+ fella
120
+ statesman
121
+ paternal
122
+ chap
123
+ landlord
124
+ brethren
125
+ lords
126
+ blokes
127
+ fraternity
128
+ bellboy
129
+ duke
130
+ ballet_dancer
131
+ dudes
132
+ fiance
133
+ colts
134
+ husbands
135
+ suitor
136
+ maternity
137
+ he
138
+ businessman
139
+ masseurs
140
+ hero
141
+ deer
142
+ busboys
143
+ boyfriends
144
+ kings
145
+ brothers
146
+ masters
147
+ stepfather
148
+ son
149
+ studs
150
+ cowboy
151
+ mentleman
152
+ sons
153
+ baritone
154
+ salesman
155
+ paramour
156
+ male_host
157
+ monks
158
+ menservants
159
+ mr.
160
+ headmasters
161
+ lads
162
+ congressman
163
+ airman
164
+ househusband
165
+ priest
166
+ barmen
167
+ barons
168
+ abbots
169
+ handyman
170
+ beard
171
+ fraternities
172
+ stewards
173
+ colt
174
+ czar
175
+ stepsons
176
+ himself
177
+ boys
178
+ lions
179
+ gentleman
180
+ penis
181
+ his
182
+ masseur
183
+ bulls
184
+ uncles
185
+ bloke
186
+ beards
187
+ hubby
188
+ lion
189
+ sorcerer
190
+ macho
191
+ father
192
+ gays
193
+ male
194
+ waiters
195
+ sperm
196
+ prostate
197
+ stepson
198
+ prostatic_utricle
199
+ businessmen
200
+ heir
201
+ waiter
202
+ headmaster
203
+ man
204
+ governor
205
+ god
206
+ bridegroom
207
+ grandpa
208
+ groom
209
+ dude
210
+ gay
211
+ gents
212
+ boy
213
+ grandfather
214
+ gelding
215
+ roosters
216
+ prostatic_utricle
217
+ priests
218
+ manservants
219
+ stailor
220
+ busboy
221
+ heros
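
female_word_file.txt and male_word_file.txt each list 221 gendered words, one per line. One way to exercise them with the utilities in this commit is the SVM test train_and_predict in utils.py, which tries to separate male from female word vectors; a sketch, assuming wv and w2i come from a loaded embedding:

from utils import train_and_predict

with open('data/female_word_file.txt') as f:
    females = [w.strip() for w in f if w.strip() in w2i]
with open('data/male_word_file.txt') as f:
    males = [w.strip() for w in f if w.strip() in w2i]

# train on the first half of each list, test on the rest
size = min(len(males), len(females))
males, females = males[:size], females[:size]
size_train = size // 2
size_test = size - size_train
acc = train_and_predict(wv, w2i, None, size_train, size_test, males, females)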
eval.py ADDED
@@ -0,0 +1,424 @@
1
+ import numpy as np
+ import scipy.stats                     # used by scipy.stats.pearsonr / spearmanr below
+ import logging
+ logger = logging.getLogger(__name__)   # referenced in evaluate_categorization
2
+ from sklearn.cluster import AgglomerativeClustering, KMeans
3
+ from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999, fetch_MTurk, fetch_RG65, fetch_RW, fetch_TR9856
4
+ from web.datasets.categorization import fetch_AP, fetch_battig, fetch_BLESS, fetch_ESSLI_1a, fetch_ESSLI_2b, \
5
+ fetch_ESSLI_2c
6
+ from web.analogy import *
7
+ from six import iteritems
8
+ from web.embedding import Embedding
9
+ from web.evaluate import calculate_purity, evaluate_categorization, evaluate_on_semeval_2012_2, evaluate_analogy, \
10
+ evaluate_on_WordRep, evaluate_similarity
11
+
12
+ def evaluate_similarity_pearson(w, X, y):
13
+ """
14
+ Calculate Pearson correlation between cosine similarity of the model
15
+ and human rated similarity of word pairs
16
+ Parameters
17
+ ----------
18
+ w : Embedding or dict
19
+ Embedding or dict instance.
20
+ X: array, shape: (n_samples, 2)
21
+ Word pairs
22
+ y: vector, shape: (n_samples,)
23
+ Human ratings
24
+ Returns
25
+ -------
26
+ cor: float
27
+ Pearson correlation
28
+ """
29
+ if isinstance(w, dict):
30
+ w = Embedding.from_dict(w)
31
+
32
+ missing_words = 0
33
+ words = w.vocabulary.word_id
34
+ for query in X:
35
+ for query_word in query:
36
+ if query_word not in words:
37
+ missing_words += 1
38
+ if missing_words > 0:
39
+ print("Missing {} words. Will replace them with mean vector".format(missing_words))
40
+
41
+ new_x = []
42
+ new_y = []
43
+ for i in range(len(X)):
44
+ if X[i, 0] in words and X[i, 1] in words:
45
+ new_x.append(X[i])
46
+ new_y.append(y[i])
47
+
48
+ X = np.array(new_x)
49
+ y = np.array(new_y)
50
+
51
+ mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
52
+ A = np.vstack(list(w.get(word, mean_vector) for word in X[:, 0]))
53
+ B = np.vstack(list(w.get(word, mean_vector) for word in X[:, 1]))
54
+ scores = np.array([v1.dot(v2.T)/(np.linalg.norm(v1)*np.linalg.norm(v2)) for v1, v2 in zip(A, B)])
55
+ return scipy.stats.pearsonr(scores, y.squeeze())
56
+
57
+ def evaluate_similarity(w, X, y):
58
+ """
59
+ Calculate Spearman correlation between cosine similarity of the model
60
+ and human rated similarity of word pairs
61
+
62
+ Parameters
63
+ ----------
64
+ w : Embedding or dict
65
+ Embedding or dict instance.
66
+
67
+ X: array, shape: (n_samples, 2)
68
+ Word pairs
69
+
70
+ y: vector, shape: (n_samples,)
71
+ Human ratings
72
+
73
+ Returns
74
+ -------
75
+ cor: float
76
+ Spearman correlation
77
+ """
78
+ if isinstance(w, dict):
79
+ w = Embedding.from_dict(w)
80
+
81
+ missing_words = 0
82
+ words = w.vocabulary.word_id
83
+ for query in X:
84
+ for query_word in query:
85
+ if query_word not in words:
86
+ missing_words += 1
87
+ # if missing_words > 0:
88
+ # print("Missing {} words. Will replace them with mean vector".format(missing_words))
89
+
90
+ new_x = []
91
+ new_y = []
92
+ exist_cnt = 0
93
+
94
+ for i in range(len(X)):
95
+ if X[i, 0] in words and X[i, 1] in words:
96
+ new_x.append(X[i])
97
+ new_y.append(y[i])
98
+ exist_cnt += 1
99
+
100
+ print('exist {} in {}'.format(exist_cnt, len(X)))
101
+ X = np.array(new_x)
102
+ y = np.array(new_y)
103
+
104
+
105
+ mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
106
+ A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
107
+ B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])
108
+ # scores = np.array([v1.dot(v2.T)/(np.linalg.norm(v1)*np.linalg.norm(v2)) for v1, v2 in zip(A, B)])
109
+ scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)])
110
+ return scipy.stats.spearmanr(scores, y).correlation
111
+
112
+
113
+ def evaluate_simi(wv, w2i, vocab):
114
+ wv_dict = dict()
115
+ for w in vocab:
116
+ wv_dict[w] = wv[w2i[w], :]
117
+
118
+ if isinstance(wv_dict, dict):
119
+ w = Embedding.from_dict(wv_dict)
120
+
121
+ # Calculate results on similarity
122
+ print("Calculating similarity benchmarks")
123
+ similarity_tasks = {
124
+ "WS353": fetch_WS353(),
125
+ "RG65": fetch_RG65(),
126
+ # "WS353R": fetch_WS353(which="relatedness"),
127
+ # "WS353S": fetch_WS353(which="similarity"),
128
+ "SimLex999": fetch_SimLex999(),
129
+ "MTurk": fetch_MTurk(),
130
+ "RW": fetch_RW(),
131
+ "MEN": fetch_MEN(),
132
+ }
133
+
134
+ # similarity_results = {}
135
+
136
+ for name, data in iteritems(similarity_tasks):
137
+ print("Sample data from {}, num of samples: {} : pair \"{}\" and \"{}\" is assigned score {}".format(
138
+ name, len(data.X), data.X[0][0], data.X[0][1], data.y[0]))
139
+ score = evaluate_similarity(w, data.X, data.y)
140
+ print("Spearman correlation of scores on {} {}".format(name, score))
141
+ # score, p_value = evaluate_similarity_pearson(w, data.X, data.y)
142
+ # print("Pearson correlation of scores on {} {}, p value: {}".format(name, score, p_value))
143
+
144
+ def evaluate_categorization(w, X, y, method="kmeans", seed=None):
145
+ """
146
+ Evaluate embeddings on categorization task.
147
+
148
+ Parameters
149
+ ----------
150
+ w: Embedding or dict
151
+ Embedding to test.
152
+
153
+ X: vector, shape: (n_samples, )
154
+ Vector of words.
155
+
156
+ y: vector, shape: (n_samples, )
157
+ Vector of cluster assignments.
158
+
159
+ method: string, default: "kmeans"
160
+ What method to use. Possible values are "agglomerative", "kmeans", "all".
161
+ If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
162
+ hyperparameter tuning to avoid overfitting).
163
+ If "kmeans" is passed, method will fit KMeans.
164
+ In both cases number of clusters is preset to the correct value.
165
+
166
+ seed: int, default: None
167
+ Seed passed to KMeans.
168
+
169
+ Returns
170
+ -------
171
+ purity: float
172
+ Purity of the best obtained clustering.
173
+
174
+ Notes
175
+ -----
176
+ KMedoids method was excluded as empirically didn't improve over KMeans (for categorization
177
+ tasks available in the package).
178
+ """
179
+
180
+ if isinstance(w, dict):
181
+ w = Embedding.from_dict(w)
182
+
183
+ assert method in ["all", "kmeans", "agglomerative"], "Unrecognized method"
184
+
185
+ mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
186
+ new_x = []
187
+ new_y = []
188
+ exist_cnt = 0
189
+
190
+ for idx, word in enumerate(X.flatten()):
191
+ if word in w :
192
+ new_x.append(X[idx])
193
+ new_y.append(y[idx])
194
+ exist_cnt += 1
195
+
196
+ print('exist {} in {}'.format(exist_cnt, len(X)))
197
+ X = np.array(new_x)
198
+ y = np.array(new_y)
199
+
200
+ words = np.vstack([w.get(word, mean_vector) for word in X.flatten()])
201
+ ids = np.random.RandomState(seed).choice(range(len(X)), len(X), replace=False)
202
+
203
+ # Evaluate clustering on several hyperparameters of AgglomerativeClustering and
204
+ # KMeans
205
+ best_purity = 0
206
+
207
+ if method == "all" or method == "agglomerative":
208
+ best_purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)),
209
+ affinity="euclidean",
210
+ linkage="ward").fit_predict(words[ids]))
211
+ logger.debug("Purity={:.3f} using affinity={} linkage={}".format(best_purity, 'euclidean', 'ward'))
212
+ for affinity in ["cosine", "euclidean"]:
213
+ for linkage in ["average", "complete"]:
214
+ purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)),
215
+ affinity=affinity,
216
+ linkage=linkage).fit_predict(words[ids]))
217
+ logger.debug("Purity={:.3f} using affinity={} linkage={}".format(purity, affinity, linkage))
218
+ best_purity = max(best_purity, purity)
219
+
220
+ if method == "all" or method == "kmeans":
221
+ purity = calculate_purity(y[ids], KMeans(random_state=seed, n_init=10, n_clusters=len(set(y))).
222
+ fit_predict(words[ids]))
223
+ logger.debug("Purity={:.3f} using KMeans".format(purity))
224
+ best_purity = max(purity, best_purity)
225
+
226
+ return best_purity
227
+
228
+ def evaluate_cate(wv, w2i, vocab, method="all", seed=None):
229
+ """
230
+ method: string, default: "all"
231
+ What method to use. Possible values are "agglomerative", "kmeans", "all".
232
+ If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
233
+ hyperparameter tuning to avoid overfitting).
234
+ If "kmeans" is passed, method will fit KMeans.
235
+ In both cases number of clusters is preset to the correct value.
236
+ seed: int, default: None
237
+ Seed passed to KMeans.
238
+ """
239
+ wv_dict = dict()
240
+ for w in vocab:
241
+ wv_dict[w] = wv[w2i[w], :]
242
+
243
+ if isinstance(wv_dict, dict):
244
+ w = Embedding.from_dict(wv_dict)
245
+
246
+ # Calculate results on categorization
247
+ print("Calculating categorization benchmarks")
248
+ categorization_tasks = {
249
+ "AP": fetch_AP(),
250
+ "ESSLI_2c": fetch_ESSLI_2c(),
251
+ "ESSLI_2b": fetch_ESSLI_2b(),
252
+ "ESSLI_1a": fetch_ESSLI_1a(),
253
+ "Battig": fetch_battig(),
254
+ "BLESS": fetch_BLESS(),
255
+ }
256
+
257
+ categorization_results = {}
258
+
259
+ # Calculate results using helper function
260
+ for name, data in iteritems(categorization_tasks):
261
+ print("Sample data from {}, num of samples: {} : \"{}\" is assigned class {}".format(
262
+ name, len(data.X), data.X[0], data.y[0]))
263
+ categorization_results[name] = evaluate_categorization(w, data.X, data.y, method=method, seed=seed)
264
+ print("Cluster purity on {} {}".format(name, categorization_results[name]))
265
+
266
+ def evaluate_analogy_google(W, vocab):
267
+ """Evaluate the trained w vectors on a variety of tasks"""
268
+
269
+ filenames = [
270
+ 'capital-common-countries.txt', 'capital-world.txt', 'currency.txt',
271
+ 'city-in-state.txt', 'family.txt', 'gram1-adjective-to-adverb.txt',
272
+ 'gram2-opposite.txt', 'gram3-comparative.txt', 'gram4-superlative.txt',
273
+ 'gram5-present-participle.txt', 'gram6-nationality-adjective.txt',
274
+ 'gram7-past-tense.txt', 'gram8-plural.txt', 'gram9-plural-verbs.txt',
275
+ ]
276
+ prefix = '/zf15/tw8cb/summer_2019/code/GloVe/eval/question-data/'
277
+
278
+ # to avoid memory overflow, could be increased/decreased
279
+ # depending on system and vocab size
280
+ split_size = 100
281
+
282
+ correct_sem = 0; # count correct semantic questions
283
+ correct_syn = 0; # count correct syntactic questions
284
+ correct_tot = 0 # count correct questions
285
+ count_sem = 0; # count all semantic questions
286
+ count_syn = 0; # count all syntactic questions
287
+ count_tot = 0 # count all questions
288
+ full_count = 0 # count all questions, including those with unknown words
289
+
290
+ for i in range(len(filenames)):
291
+ with open('%s/%s' % (prefix, filenames[i]), 'r') as f:
292
+ full_data = [line.rstrip().split(' ') for line in f]
293
+ full_count += len(full_data)
294
+ data = [x for x in full_data if all(word in vocab for word in x)]
295
+
296
+ indices = np.array([[vocab[word] for word in row] for row in data])
297
+ ind1, ind2, ind3, ind4 = indices.T
298
+
299
+ predictions = np.zeros((len(indices),))
300
+ num_iter = int(np.ceil(len(indices) / float(split_size)))
301
+ for j in range(num_iter):
302
+ subset = np.arange(j*split_size, min((j + 1)*split_size, len(ind1)))
303
+
304
+ pred_vec = (W[ind2[subset], :] - W[ind1[subset], :]
305
+ + W[ind3[subset], :])
306
+ #cosine similarity if input W has been normalized
307
+ dist = np.dot(W, pred_vec.T)
308
+
309
+ for k in range(len(subset)):
310
+ dist[ind1[subset[k]], k] = -np.Inf
311
+ dist[ind2[subset[k]], k] = -np.Inf
312
+ dist[ind3[subset[k]], k] = -np.Inf
313
+
314
+ # predicted word index
315
+ predictions[subset] = np.argmax(dist, 0).flatten()
316
+
317
+ val = (ind4 == predictions) # correct predictions
318
+ count_tot = count_tot + len(ind1)
319
+ correct_tot = correct_tot + sum(val)
320
+ if i < 5:
321
+ count_sem = count_sem + len(ind1)
322
+ correct_sem = correct_sem + sum(val)
323
+ else:
324
+ count_syn = count_syn + len(ind1)
325
+ correct_syn = correct_syn + sum(val)
326
+
327
+ print("%s:" % filenames[i])
328
+ print('ACCURACY TOP1: %.2f%% (%d/%d)' %
329
+ (np.mean(val) * 100, np.sum(val), len(val)))
330
+
331
+ print('Questions seen/total: %.2f%% (%d/%d)' %
332
+ (100 * count_tot / float(full_count), count_tot, full_count))
333
+ print('Semantic accuracy: %.2f%% (%i/%i)' %
334
+ (100 * correct_sem / float(count_sem), correct_sem, count_sem))
335
+ print('Syntactic accuracy: %.2f%% (%i/%i)' %
336
+ (100 * correct_syn / float(count_syn), correct_syn, count_syn))
337
+ print('Total accuracy: %.2f%% (%i/%i)' % (100 * correct_tot / float(count_tot), correct_tot, count_tot))
338
+
339
+
340
+ def evaluate_analogy_msr(W, vocab, file_name='EN-MSR.txt'):
341
+ """Evaluate the trained word vectors on a variety of tasks"""
342
+
343
+ prefix = '/zf15/tw8cb/summer_2019/code/GloVe/eval/question-data/'
344
+
345
+ # to avoid memory overflow, could be increased/decreased
346
+ # depending on system and vocab size
347
+ split_size = 100
348
+
349
+ correct_sem = 0; # count correct semantic questions
350
+ correct_syn = 0; # count correct syntactic questions
351
+ correct_tot = 0 # count correct questions
352
+ count_sem = 0; # count all semantic questions
353
+ count_syn = 0; # count all syntactic questions
354
+ count_tot = 0 # count all questions
355
+ full_count = 0 # count all questions, including those with unknown words
356
+
357
+ with open('%s/%s' % (prefix, file_name), 'r') as f:
358
+ full_data = []
359
+ for line in f:
360
+ tokens = line.rstrip().split(' ')
361
+ full_data.append([tokens[0], tokens[1], tokens[2], tokens[4]])
362
+ full_count += len(full_data)
363
+ data = [x for x in full_data if all(word in vocab for word in x)]
364
+
365
+ indices = np.array([[vocab[word] for word in row] for row in data])
366
+ ind1, ind2, ind3, ind4 = indices.T
367
+
368
+ predictions = np.zeros((len(indices),))
369
+ num_iter = int(np.ceil(len(indices) / float(split_size)))
370
+ for j in range(num_iter):
371
+ subset = np.arange(j*split_size, min((j + 1)*split_size, len(ind1)))
372
+
373
+ pred_vec = (W[ind2[subset], :] - W[ind1[subset], :]
374
+ + W[ind3[subset], :])
375
+ #cosine similarity if input W has been normalized
376
+ dist = np.dot(W, pred_vec.T)
377
+
378
+ for k in range(len(subset)):
379
+ dist[ind1[subset[k]], k] = -np.Inf
380
+ dist[ind2[subset[k]], k] = -np.Inf
381
+ dist[ind3[subset[k]], k] = -np.Inf
382
+
383
+ # predicted word index
384
+ predictions[subset] = np.argmax(dist, 0).flatten()
385
+
386
+ val = (ind4 == predictions) # correct predictions
387
+ count_tot = count_tot + len(ind1)
388
+ correct_tot = correct_tot + sum(val)
389
+
390
+ # print("%s:" % filenames[i])
391
+ print(len(val))
392
+ print('ACCURACY TOP1-MSR: %.2f%% (%d/%d)' %
393
+ (np.mean(val) * 100, np.sum(val), len(val)))
394
+
395
+ def evaluate_analogy_semeval2012(w_dict):
396
+ score = evaluate_on_semeval_2012_2(w_dict)['all']
397
+ print("Analogy prediction accuracy on {} {}".format("SemEval2012", score))
398
+
399
+ def evaluate_ana(wv, w2i, vocab):
400
+ W_norm = np.zeros(wv.shape)
401
+ d = (np.sum(wv ** 2, 1) ** (0.5))
402
+ W_norm = (wv.T / d).T
403
+
404
+ evaluate_analogy_msr(W_norm, w2i)
405
+ evaluate_analogy_google(W_norm, w2i)
406
+
407
+ wv_dict = dict()
408
+ for w in vocab:
409
+ wv_dict[w] = W_norm[w2i[w], :]
410
+
411
+ if isinstance(wv_dict, dict):
412
+ w = Embedding.from_dict(wv_dict)
413
+ evaluate_analogy_semeval2012(w)
414
+
415
+ # analogy_tasks = {
416
+ # "Google": fetch_google_analogy(),
417
+ # "MSR": fetch_msr_analogy()
418
+ # }
419
+
420
+ # analogy_results = {}
421
+
422
+ # for name, data in iteritems(analogy_tasks):
423
+ # analogy_results[name] = evaluate_analogy(w, data.X, data.y)
424
+ # print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))
utils.py ADDED
@@ -0,0 +1,406 @@
1
+ import string
2
+ from tqdm import tqdm
3
+ import pickle
+ import random                # random.seed is used in p_value_sample
4
+
5
+ import scipy
+ import scipy.spatial         # scipy.spatial.distance.cosine in similarity()
+ import scipy.special         # scipy.special.comb in p_value_sample()
6
+ import numpy as np
7
+ from numpy import linalg as LA
8
+ from sklearn.decomposition import PCA
9
+
10
+ # Experiment 1
11
+ WEAT_words = {
12
+ 'A':['John', 'Paul', 'Mike', 'Kevin', 'Steve', 'Greg', 'Jeff', 'Bill'],
13
+ 'B':['Amy', 'Joan', 'Lisa', 'Sarah', 'Diana', 'Kate', 'Ann', 'Donna'],
14
+ 'C':['executive', 'management', 'professional', 'corporation', 'salary', 'office', 'business', 'career'],
15
+ 'D':['home', 'parents', 'children', 'family', 'cousins', 'marriage', 'wedding', 'relatives'],
16
+ 'E':['math', 'algebra', 'geometry', 'calculus', 'equations', 'computation', 'numbers', 'addition'],
17
+ 'F':['poetry', 'art', 'dance', 'literature', 'novel', 'symphony', 'drama', 'sculpture'],
18
+ 'G':['science', 'technology', 'physics', 'chemistry', 'einstein', 'nasa', 'experiment', 'astronomy'],
19
+ 'H':['poetry', 'art', 'shakespeare', 'dance', 'literature', 'novel', 'symphony', 'drama'],
20
+ }
21
+
22
+
23
+ def has_punct(w):
24
+
25
+ if any([c in string.punctuation for c in w]):
26
+ return True
27
+ return False
28
+
29
+ def has_digit(w):
30
+
31
+ if any([c in '0123456789' for c in w]):
32
+ return True
33
+ return False
34
+
35
+ def limit_vocab(wv, w2i, vocab, exclude = None):
36
+ vocab_limited = []
37
+ for w in tqdm(vocab[:50000]):
38
+ if w.lower() != w:
39
+ continue
40
+ if len(w) >= 20:
41
+ continue
42
+ if has_digit(w):
43
+ continue
44
+ if '_' in w:
45
+ p = [has_punct(subw) for subw in w.split('_')]
46
+ if not any(p):
47
+ vocab_limited.append(w)
48
+ continue
49
+ if has_punct(w):
50
+ continue
51
+ vocab_limited.append(w)
52
+
53
+ if exclude:
54
+ vocab_limited = list(set(vocab_limited) - set(exclude))
55
+
56
+ print("size of vocabulary:", len(vocab_limited))
57
+
58
+ wv_limited = np.zeros((len(vocab_limited), len(wv[0, :])))
59
+ for i,w in enumerate(vocab_limited):
60
+ wv_limited[i,:] = wv[w2i[w],:]
61
+
62
+ w2i_limited = {w: i for i, w in enumerate(vocab_limited)}
63
+
64
+ return vocab_limited, wv_limited, w2i_limited
65
+
66
+ def norm_stand(wv):
67
+ W_norm = np.zeros(wv.shape)
68
+ d = (np.sum(wv ** 2, 1) ** (0.5))
69
+ W_norm = (wv.T / d).T
70
+ return W_norm
71
+
72
+ def normalize(wv):
73
+
74
+ # normalize vectors
75
+ norms = np.apply_along_axis(LA.norm, 1, wv)
76
+ wv = wv / norms[:, np.newaxis]
77
+ return wv
78
+
79
+
80
+ def topK(w, wv, w2i, vocab, k=10):
81
+
82
+ # extract the word vector for word w
83
+ idx = w2i[w]
84
+ vec = wv[idx, :]
85
+
86
+ # compute similarity of w with all words in the vocabulary
87
+ sim = wv.dot(vec)
88
+ # sim = []
89
+ # for i in range(len(wv)):
90
+ # sim.append(1-scipy.spatial.distance.cosine(wv[i, :], vec))
91
+ # sim = np.array(sim)
92
+
93
+ # sort similarities by descending order
94
+ sort_sim = (sim.argsort())[::-1]
95
+
96
+ # choose topK
97
+ best = sort_sim[:(k+1)]
98
+
99
+ return [vocab[i] for i in best if i!=idx]
100
+
101
+
102
+ def similarity(w1, w2, wv, w2i):
103
+
104
+ i1 = w2i[w1]
105
+ i2 = w2i[w2]
106
+ vec1 = wv[i1, :]
107
+ vec2 = wv[i2, :]
108
+
109
+ return 1-scipy.spatial.distance.cosine(vec1, vec2)
110
+
111
+
112
+
113
+ def drop(u, v):
114
+ return u - v * u.dot(v) / v.dot(v)
115
+
116
+ from sklearn.decomposition import PCA
117
+ from sklearn import preprocessing
118
+
119
+ def doPCA(pairs, wv, w2i):
120
+
121
+ matrix = []
122
+ cnt = 0
123
+
124
+ if type(pairs[0]) is list:
125
+ for a, b in pairs:
126
+ if not (a in w2i and b in w2i): continue
127
+ center = (wv[w2i[a], :] + wv[w2i[b], :])/2
128
+ matrix.append(wv[w2i[a], :] - center)
129
+ matrix.append(wv[w2i[b], :] - center)
130
+ cnt += 1
131
+ else:
132
+ for a in pairs:
133
+ if not (a in w2i): continue
134
+ matrix.append(wv[w2i[a], :])
135
+ cnt += 1
136
+
137
+ embeds = np.array(matrix)
138
+ wv_mean = np.mean(np.array(embeds), axis=0)
139
+ wv_hat = np.zeros(embeds.shape).astype(float)
140
+
141
+ for i in range(len(embeds)):
142
+ wv_hat[i, :] = embeds[i, :] - wv_mean
143
+ matrix = wv_hat
144
+
145
+ matrix = np.array(matrix)
146
+ pca = PCA()
147
+ pca.fit(matrix)
148
+ print('pairs used in PCA: ', cnt)
149
+ return pca
150
+
151
+ # get tuples of biases and counts of masculine/feminine NN for each word (for bias-by-neighbors)
152
+ import operator
153
+ def bias_by_neighbors(wv, w2i, vocab, gender_bias_bef, size, neighbours_num = 100):
154
+
155
+ tuples = []
156
+
157
+ sorted_g = sorted(gender_bias_bef.items(), key=operator.itemgetter(1))
158
+ female = [item[0] for item in sorted_g[:size]]
159
+ male = [item[0] for item in sorted_g[-size:]]
160
+ # vocab = male + female
161
+ selected = female + male if size > 0 else vocab
162
+
163
+ for w in selected:
164
+
165
+ top = topK(w, wv, w2i, vocab, k=neighbours_num+5)[:neighbours_num]
166
+
167
+ m = 0
168
+ f = 0
169
+ for t in top:
170
+ if gender_bias_bef[t] > 0:
171
+ m+=1
172
+ else:
173
+ f+=1
174
+
175
+ tuples.append((w, gender_bias_bef[w], m, f))
176
+
177
+ return tuples
178
+
179
+ def get_tuples_prof(wv, w2i, vocab, words, gender_bias_dict):
180
+
181
+ wv = normalize(wv)
182
+
183
+ tuples = []
184
+ for w in words:
185
+ if w not in gender_bias_dict:
186
+ continue
187
+
188
+ top = topK(w, wv, w2i, vocab, k=105)[:100]
189
+
190
+ m = 0
191
+ f = 0
192
+ for t in top:
193
+ if gender_bias_dict[t] > 0:
194
+ m+=1
195
+ else:
196
+ f+=1
197
+
198
+ tuples.append((w, gender_bias_dict[w], m, f))
199
+
200
+ return tuples
201
+
202
+ # compute correlation between bias-by-projection and bias-by-neighbors
203
+
204
+ import scipy.stats
205
+
206
+ def pearson(a,b):
207
+
208
+ return scipy.stats.pearsonr(a,b)
209
+
210
+ def compute_corr(tuples, i1, i2):
211
+
212
+ a = []
213
+ b = []
214
+ for t in tuples:
215
+ a.append(t[i1])
216
+ b.append(t[i2])
217
+ assert(len(a)==len(b))
218
+ print('pearson: ', scipy.stats.pearsonr(a,b))
219
+ print('spearman: ', scipy.stats.spearmanr(a, b))
220
+
221
+ # Auxiliary functions
222
+
223
+ from sklearn.cluster import KMeans
224
+ from sklearn.manifold import TSNE
225
+
226
+ def visualize(vectors, y_true, y_pred, ax, title, random_state, num_clusters = 2):
227
+
228
+ # perform TSNE
229
+
230
+ X_embedded = TSNE(n_components=2, random_state=random_state).fit_transform(vectors)
231
+ for x,p,y in zip(X_embedded, y_pred, y_true):
232
+ if p:
233
+ if y:
234
+ ax.scatter(x[0], x[1], marker = '.', c = 'c')
235
+ else:
236
+ ax.scatter(x[0], x[1], marker = 'x', c = 'c')
237
+ else:
238
+ if y:
239
+ ax.scatter(x[0], x[1], marker = '.', c = 'darkviolet')
240
+ else:
241
+ ax.scatter(x[0], x[1], marker = 'x', c = 'darkviolet')
242
+
243
+
244
+ ax.text(.01, .9, title ,transform=ax.transAxes, fontsize=15)
245
+
246
+
247
+ def extract_vectors(words, wv, w2i):
248
+
249
+ X = [wv[w2i[x],:] for x in words]
250
+
251
+ return X
252
+
253
+
254
+ def cluster_and_visualize(words, X, random_state, y_true, num=2):
255
+
256
+ y_pred = KMeans(n_clusters=num, random_state=random_state).fit_predict(X)
257
+ # fig, axs = plt.subplots(figsize=(6, 3))
258
+ # visualize(X, y_true, y_pred, axs, 'Original', random_state)
259
+ correct = [1 if item1 == item2 else 0 for (item1,item2) in zip(y_true, y_pred) ]
260
+ print('precision', max(sum(correct)/float(len(correct)), 1 - sum(correct)/float(len(correct))))
261
+
262
+
263
+ import scipy.stats
264
+ from sklearn import svm
265
+ def train_and_predict(wv, w2i, vocab, size_train, size_test, males, females):
266
+
267
+ X_train = [wv[w2i[w],:] for w in males[:size_train]+females[:size_train]]
268
+ Y_train = [1]*size_train + [0]*size_train
269
+ X_test = [wv[w2i[w],:] for w in males[size_train:]+females[size_train:]]
270
+ Y_test = [1]*size_test + [0]*size_test
271
+
272
+ clf = svm.SVC(gamma='auto')
273
+ clf.fit(X_train, Y_train)
274
+
275
+ preds = clf.predict(X_test)
276
+
277
+ accuracy = [1 if y==z else 0 for y,z in zip(preds, Y_test)]
278
+ acc = float(sum(accuracy))/len(accuracy)
279
+ print('accuracy:', float(sum(accuracy))/len(accuracy))
280
+
281
+ return acc
282
+
283
+
284
+ # Auxiliary functions for experiments by Caliskan et al.
285
+
286
+ import scipy
287
+ import scipy.misc as misc
288
+ import itertools
289
+
290
+
291
+ def s_word(w, A, B, wv, w2i, vocab, all_s_words):
292
+
293
+ if w in all_s_words:
294
+ return all_s_words[w]
295
+
296
+ mean_a = []
297
+ mean_b = []
298
+
299
+ for a in A:
300
+ mean_a.append(similarity(w, a, wv, w2i))
301
+ for b in B:
302
+ mean_b.append(similarity(w, b, wv, w2i))
303
+
304
+ mean_a = sum(mean_a)/float(len(mean_a))
305
+ mean_b = sum(mean_b)/float(len(mean_b))
306
+
307
+ all_s_words[w] = mean_a - mean_b
308
+
309
+ return all_s_words[w]
310
+
311
+
312
+ def s_group(X, Y, A, B, wv, w2i, vocab, all_s_words):
313
+
314
+ total = 0
315
+ for x in X:
316
+ total += s_word(x, A, B, wv, w2i, vocab, all_s_words)
317
+ for y in Y:
318
+ total -= s_word(y, A, B, wv, w2i, vocab, all_s_words)
319
+
320
+ return total
321
+
322
+
323
+ def p_value_exhust(X, Y, A, B, wv, w2i, vocab):
324
+
325
+ if len(X) > 10:
326
+ print('might take too long, use sampled version: p_value')
327
+ return
328
+
329
+ assert(len(X) == len(Y))
330
+
331
+ all_s_words = {}
332
+ s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)
333
+
334
+ union = set(X+Y)
335
+ subset_size = int(len(union)/2)
336
+
337
+ larger = 0
338
+ total = 0
339
+ for subset in set(itertools.combinations(union, subset_size)):
340
+ total += 1
341
+ Xi = list(set(subset))
342
+ Yi = list(union - set(subset))
343
+ if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
344
+ larger += 1
345
+ print('num of samples', total)
346
+ return larger/float(total)
347
+
348
+ def association_diff(t, A, B, wv, w2i):
349
+
350
+ mean_a = []
351
+ mean_b = []
352
+
353
+ for a in A:
354
+ mean_a.append(similarity(t, a, wv, w2i))
355
+ for b in B:
356
+ mean_b.append(similarity(t, b, wv, w2i))
357
+
358
+ mean_a = sum(mean_a)/float(len(mean_a))
359
+ mean_b = sum(mean_b)/float(len(mean_b))
360
+
361
+ return mean_a - mean_b
362
+
363
+ def effect_size(X, Y, A, B, wv, w2i, vocab):
364
+
365
+ assert(len(X) == len(Y))
366
+ assert(len(A) == len(B))
367
+
368
+ norm_x = []
369
+ norm_y = []
370
+
371
+ for x in X:
372
+ norm_x.append(association_diff(x, A, B, wv, w2i))
373
+ for y in Y:
374
+ norm_y.append(association_diff(y, A, B, wv, w2i))
375
+
376
+ std = np.std(norm_x+norm_y, ddof=1)
377
+ norm_x = sum(norm_x) / float(len(norm_x))
378
+ norm_y = sum(norm_y) / float(len(norm_y))
379
+
380
+ return (norm_x-norm_y)/std
381
+
382
+
383
+ def p_value_sample(X, Y, A, B, wv, w2i, vocab):
384
+
385
+ random.seed(10)
386
+ np.random.seed(10)
387
+ all_s_words = {}
388
+
389
+ assert(len(X) == len(Y))
390
+ length = len(X)
391
+
392
+ s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)
393
+
394
+ num_of_samples = min(1000000, int(scipy.special.comb(length*2,length)*100))
395
+ print('num of samples', num_of_samples)
396
+ larger = 0
397
+ for i in range(num_of_samples):
398
+ permute = np.random.permutation(X+Y)
399
+ Xi = permute[:length]
400
+ Yi = permute[length:]
401
+ if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
402
+ larger += 1
403
+
404
+ return larger/float(num_of_samples)
405
+
406
+
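
WEAT_words at the top of utils.py holds the Caliskan et al. test sets (A/B: male vs. female first names, C/D: career vs. family, E/F: math vs. arts, G/H: science vs. arts), and effect_size / p_value_exhust compute the corresponding WEAT statistics. A minimal sketch of one test, assuming a normalized embedding wv with index w2i and word list vocab in which all test words occur:

from utils import WEAT_words, effect_size, p_value_exhust

X, Y = WEAT_words['A'], WEAT_words['B']   # targets: male vs. female first names
A, B = WEAT_words['C'], WEAT_words['D']   # attributes: career vs. family terms

d = effect_size(X, Y, A, B, wv, w2i, vocab)
p = p_value_exhust(X, Y, A, B, wv, w2i, vocab)   # exhaustive permutation test, only feasible for |X| <= 10
print('WEAT effect size:', d, 'p-value:', p)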