import spacy
from spacy.matcher import Matcher
from collections import Counter
from operator import itemgetter
import pandas as pd
from tqdm import tqdm
import scipy.stats as stats
from argparse import ArgumentParser


def calculate_dict(female_array, male_array):
    """Count the items of both sequences and align the key sets of the counters.

    Args:
        female_array: Iterable of hashable items for the female group.
        male_array: Iterable of hashable items for the male group.

    Returns:
        A ``(counter_f_h, counter_m_h)`` tuple of ``Counter`` objects. Every
        key present in one counter is also present in the other, with an
        explicit count of 0 where the item never occurred.
    """
    counter_f_h = Counter(female_array)
    counter_m_h = Counter(male_array)
    # Insert explicit zero entries so both counters expose the same keys
    # (odds_ratio compares the number of keys of its two inputs).
    for key in counter_f_h.keys() - counter_m_h.keys():
        counter_m_h[key] = 0
    for key in counter_m_h.keys() - counter_f_h.keys():
        counter_f_h[key] = 0
    return counter_f_h, counter_m_h


def odds_ratio(f_dict, m_dict, topk=50, threshold=20):
    """Compute per-key male-vs-female odds ratios and return both extremes.

    For each key the ratio is ``(m_num / f_num) / (non_m_num / non_f_num)``,
    rounded to two decimals, where ``non_*_num`` is the group's total count
    minus the key's own count.

    Args:
        f_dict: Mapping of key -> count for the female group.
        m_dict: Mapping of key -> count for the male group. Must have the
            same number of keys as ``f_dict``.
        topk: Maximum number of entries kept in each returned dict.
        threshold: A key is only scored when its count is at least this
            value in both groups.

    Returns:
        A tuple ``(largest, smallest)`` of dicts mapping key -> odds ratio:
        the ``topk`` highest ratios in descending order, and the ``topk``
        lowest ratios in ascending order.

    Raises:
        ValueError: If the two mappings do not have the same number of keys.
    """
    very_small_value = 0.00001
    if len(f_dict.keys()) != len(m_dict.keys()):
        raise ValueError('The category for analyzing the male and female should be the same!')

    total_num_f = sum(f_dict.values())
    total_num_m = sum(m_dict.values())

    ratios = {}
    for key, f_num in f_dict.items():
        m_num = m_dict[key]
        # Only consider events with at least `threshold` occurrences for
        # both genders.
        if f_num < threshold or m_num < threshold:
            continue
        # A zero denominator (e.g. one key holding every count of a group,
        # or a zero count when threshold <= 0) would raise
        # ZeroDivisionError; substitute a tiny positive value instead.
        # Non-zero counts are used unchanged, so ordinary results are
        # unaffected.
        f_count = f_num or very_small_value
        non_f_num = (total_num_f - f_num) or very_small_value
        non_m_num = (total_num_m - m_num) or very_small_value
        ratios[key] = round((m_num / f_count) / (non_m_num / non_f_num), 2)

    largest = sorted(ratios.items(), key=itemgetter(1), reverse=True)[:topk]
    smallest = sorted(ratios.items(), key=itemgetter(1))[:topk]
    return dict(largest), dict(smallest)


class Word_Extraction:
    """Extract single-token matches of selected part-of-speech tags with spaCy."""

    def __init__(self, word_types=None):
        """Load the spaCy pipeline and register one POS pattern per word type.

        Args:
            word_types: Iterable of word-type names. 'noun', 'adj' and 'verb'
                are recognised; any other value is ignored. The argument is
                iterated directly, so leaving it at the default ``None``
                raises ``TypeError``.
        """
        self.nlp = spacy.load("en_core_web_sm")
        self.matcher = Matcher(self.nlp.vocab)
        patterns = []
        for word_type in word_types:
            if word_type == 'noun':
                patterns.append([{'POS': 'NOUN'}])
            elif word_type == 'adj':
                patterns.append([{'POS': 'ADJ'}])
            elif word_type == 'verb':
                patterns.append([{"POS": "VERB"}])
        self.matcher.add("demo", patterns)

    def extract_word(self, doc):
        """Run the pipeline on ``doc`` and collect the text of every match."""
        doc = self.nlp(doc)
        matches = self.matcher(doc)
        vocab = []
        for match_id, start, end in matches:
            string_id = self.nlp.vocab.strings[match_id]  # Get string representation
            span = doc[start:end]  # The matched span
vocab.append(span.text) return vocab def compute_lexical_content(list1, list2, threshold=10): noun_f, noun_m = [], [] adj_f, adj_m = [], [] len_f, len_m = [], [] noun_extract = Word_Extraction(['noun']) adj_extract = Word_Extraction(['adj']) ability_m, standout_m, ability_f, standout_f = 0, 0, 0, 0 masculine_m, feminine_m, masculine_f, feminine_f = 0, 0, 0, 0 for i in tqdm(range(len(list1)), ascii=True): noun_vocab_f = noun_extract.extract_word(list1[i]) # For normal analysis for v in noun_vocab_f: v = v.split()[0].replace('', '').replace('', '').replace('', '').replace('', '').replace('