Spaces:
Runtime error
Runtime error
import spacy | |
from spacy.matcher import Matcher | |
from collections import Counter | |
from operator import itemgetter | |
import pandas as pd | |
from tqdm import tqdm | |
import scipy.stats as stats | |
from argparse import ArgumentParser | |
def calculate_dict(female_array, male_array):
    """Count the items of both inputs and give the two counters the same keys.

    Args:
        female_array: Iterable of hashable items for the female group.
        male_array: Iterable of hashable items for the male group.

    Returns:
        A ``(female_counter, male_counter)`` tuple of ``Counter`` objects.
        Every key seen in only one input is stored in the other counter with
        an explicit count of 0, so both counters expose identical key sets.
    """
    female_counts, male_counts = Counter(female_array), Counter(male_array)

    # Pad each side with explicit zero entries for the keys it is missing, so
    # later per-key lookups and key-count comparisons line up.
    only_in_female = set(female_counts) - set(male_counts)
    male_counts.update(dict.fromkeys(only_in_female, 0))

    only_in_male = set(male_counts) - set(female_counts)
    female_counts.update(dict.fromkeys(only_in_male, 0))

    return female_counts, male_counts
def odds_ratio(f_dict, m_dict, topk=50, threshold=20):
    """Rank categories by their male-versus-female odds ratio.

    For every key whose count reaches ``threshold`` in both mappings, the
    value ``(m / f) / (non_m / non_f)`` is computed and rounded to two
    decimals. ``m`` and ``f`` are the key's counts, and ``non_m`` / ``non_f``
    are each group's total count minus the key's own count.

    Args:
        f_dict: Mapping of category -> count for the female group.
        m_dict: Mapping of category -> count for the male group. It must have
            as many keys as ``f_dict`` and contain every key of ``f_dict``.
        topk: Maximum number of entries kept in each returned dict.
        threshold: Minimum count a key needs in both groups to be scored.

    Returns:
        A 2-tuple of dicts mapping category -> odds ratio. The first holds the
        ``topk`` highest ratios in descending order, the second the ``topk``
        lowest ratios in ascending order.

    Raises:
        Exception: If the two mappings have a different number of keys.
    """
    # Stand-in for any denominator that would otherwise be exactly zero.
    very_small_value = 0.00001

    # NOTE: only the number of keys is compared here, not the keys themselves.
    if len(f_dict.keys()) != len(m_dict.keys()):
        raise Exception('The category for analyzing the male and female should be the same!')

    total_num_f = sum(f_dict.values())
    total_num_m = sum(m_dict.values())

    ratios = {}
    for key, f_num in f_dict.items():
        m_num = m_dict[key]
        # Only score events with at least `threshold` occurrences for both genders.
        if not (f_num >= threshold and m_num >= threshold):
            continue

        # A key that makes up a group's whole total leaves a zero complement,
        # and threshold <= 0 can let f_num == 0 through. Swap those zeros for
        # the smoothing constant instead of raising ZeroDivisionError.
        # Non-zero values pass through unchanged.
        f_den = f_num or very_small_value
        non_f_num = (total_num_f - f_num) or very_small_value
        non_m_num = (total_num_m - m_num) or very_small_value

        ratios[key] = round((m_num / f_den) / (non_m_num / non_f_num), 2)

    highest = dict(sorted(ratios.items(), key=itemgetter(1), reverse=True)[:topk])
    lowest = dict(sorted(ratios.items(), key=itemgetter(1))[:topk])
    return highest, lowest
class Word_Extraction:
    """Extract single tokens of selected parts of speech from text with spaCy.

    The constructor loads the ``en_core_web_sm`` pipeline and registers one
    single-token ``Matcher`` pattern per requested word type.
    """

    # Accepted ``word_types`` names and the spaCy coarse POS tag each matches.
    _POS_BY_WORD_TYPE = {'noun': 'NOUN', 'adj': 'ADJ', 'verb': 'VERB'}

    def __init__(self, word_types=None):
        """Build the pipeline and matcher.

        Args:
            word_types: Iterable of ``'noun'``, ``'adj'`` and/or ``'verb'``.
                Unrecognized names are ignored. ``None`` (the default)
                selects every supported type; previously it raised
                ``TypeError`` because ``None`` was iterated directly.
        """
        if word_types is None:
            word_types = list(self._POS_BY_WORD_TYPE)

        self.nlp = spacy.load("en_core_web_sm")
        self.matcher = Matcher(self.nlp.vocab)

        # One single-token pattern per recognized word type.
        patterns = [
            [{'POS': self._POS_BY_WORD_TYPE[word_type]}]
            for word_type in word_types
            if word_type in self._POS_BY_WORD_TYPE
        ]
        self.matcher.add("demo", patterns)

    def extract_word(self, doc):
        """Return the text of every matcher hit in *doc*.

        Args:
            doc: Raw text to run through the spaCy pipeline.

        Returns:
            A list with the text of each matched span, in the order the
            matcher reports them.
        """
        parsed = self.nlp(doc)
        return [parsed[start:end].text for _match_id, start, end in self.matcher(parsed)]
def _normalize_word(raw):
    """Return the cleaned, lower-cased first token of *raw*, or '' if nothing is left."""
    pieces = raw.split()
    if not pieces:
        # Whitespace-only match: nothing to index, so report "no word".
        return ''
    return pieces[0].replace('<return>', '').replace('<return', '').strip(',./?').lower()


def _collect_words(texts, extractors):
    """Run each extractor over each text; return one cleaned word list per extractor."""
    collected = [[] for _ in extractors]
    for text in tqdm(texts, ascii=True):
        for bucket, extractor in zip(collected, extractors):
            for raw in extractor.extract_word(text):
                word = _normalize_word(raw)
                # Drop tokens that were only markup or punctuation, so the
                # empty string is never counted as a word.
                if word:
                    bucket.append(word)
    return collected


def compute_lexical_content(list1, list2, threshold=10):
    """Summarize the nouns and adjectives most skewed toward each group.

    Nouns and adjectives are extracted from every text, cleaned, counted per
    group and ranked with ``odds_ratio``. The ten highest-ranked words per
    part of speech fill the ``male`` column and the ten lowest-ranked words
    fill the ``female`` column.

    Args:
        list1: Iterable of texts for the female group.
        list2: Iterable of texts for the male group.
        threshold: Minimum count a word needs in both groups to be ranked;
            forwarded to ``odds_ratio``.

    Returns:
        A ``pandas.DataFrame`` indexed by ``['noun', 'adj']`` with columns
        ``male`` and ``female``. Each cell is a ``", "``-joined string of up
        to ten words.
    """
    extractors = (Word_Extraction(['noun']), Word_Extraction(['adj']))
    noun_f, adj_f = _collect_words(list1, extractors)
    noun_m, adj_m = _collect_words(list2, extractors)

    noun_counter_f, noun_counter_m = calculate_dict(noun_f, noun_m)
    noun_res_m, noun_res_f = odds_ratio(noun_counter_f, noun_counter_m, threshold=threshold)
    adj_counter_f, adj_counter_m = calculate_dict(adj_f, adj_m)
    adj_res_m, adj_res_f = odds_ratio(adj_counter_f, adj_counter_m, threshold=threshold)

    def top_words(ranked):
        # The ranked dicts are already ordered; keep the first ten keys.
        return ", ".join(list(ranked)[:10])

    data = {
        'male': [top_words(noun_res_m), top_words(adj_res_m)],
        'female': [top_words(noun_res_f), top_words(adj_res_f)],
    }
    return pd.DataFrame(data, index=['noun', 'adj'])