|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging |
|
import warnings |
|
from pathlib import Path |
|
|
|
import numpy as np |
|
import pandas as pd |
|
from sklearn.preprocessing import MultiLabelBinarizer |
|
|
|
|
|
# Silence every UserWarning process-wide.
# NOTE(review): presumably aimed at scikit-learn's "unknown class(es)"
# warnings raised while binarizing out-of-vocabulary tokens -- confirm.
warnings.filterwarnings(action="ignore", category=UserWarning)

# PMI takes np.log of probabilities that can be exactly zero; do not warn
# about the resulting log(0) = -inf.
np.seterr(divide="ignore")

logs = logging.getLogger(__name__)
logs.setLevel(logging.INFO)
# Records are not handed to ancestor loggers; only the handlers below emit.
logs.propagate = False

# Attach the handlers only when the logger has none yet, so that running this
# code again does not duplicate every log line.
if not logs.handlers:
    Path("./log_files").mkdir(exist_ok=True)

    # The file receives the full INFO-level trace (including dumped frames).
    file = logging.FileHandler("./log_files/npmi.log")
    fileformat = logging.Formatter("%(asctime)s:%(message)s")
    file.setLevel(logging.INFO)
    file.setFormatter(fileformat)

    # The console only shows warnings and errors.
    stream = logging.StreamHandler()
    streamformat = logging.Formatter("[data_measurements_tool] %(message)s")
    stream.setLevel(logging.WARNING)
    stream.setFormatter(streamformat)

    logs.addHandler(file)
    logs.addHandler(stream)

# Ask pandas to treat +/-inf as missing so that dropna() also discards
# log(0) results. The option was deprecated in pandas 2.1 and removed in
# pandas 3.0; setting it unguarded makes importing this module fail there.
try:
    with warnings.catch_warnings():
        # pandas 2.1+ emits a FutureWarning when this option is set.
        warnings.simplefilter("ignore", FutureWarning)
        pd.set_option("use_inf_as_na", True)
except KeyError:
    # pandas signals an unknown option with OptionError, a KeyError subclass.
    logs.warning(
        "pandas option 'use_inf_as_na' is unavailable; infinite values "
        "must be converted to NaN explicitly before calling dropna()."
    )
|
|
|
# Default number of evenly spaced slice boundaries used when batching
# sentences for binarization (N boundaries delimit N - 1 batches).
_NUM_BATCHES = 500


class nPMI:
    """
    Co-occurrence counts, PMI and normalized PMI of vocabulary words with
    subgroup terms.

    Sentences are binarized against the vocabulary in batches; for a given
    subgroup term, each word's co-occurrence count with that term is turned
    into a PMI score and a normalized PMI (nPMI) score.
    """

    def __init__(
        self,
        vocab_counts_df,
        tokenized_df,
        tokenized_col_name="tokenized_text",
        num_batches=_NUM_BATCHES,
    ):
        """
        :param vocab_counts_df: DataFrame indexed by vocabulary word; its
            "count" and "proportion" columns are read by the PMI methods.
        :param tokenized_df: DataFrame whose ``tokenized_col_name`` column
            holds the tokens of each sentence.
        :param tokenized_col_name: Name of the token column in tokenized_df.
        :param num_batches: Number of evenly spaced slice boundaries used to
            split the sentences into batches for binarization.
        """
        logs.info("Initiating npmi class.")
        logs.info("vocab is")
        logs.info(vocab_counts_df)
        self.vocab_counts_df = vocab_counts_df
        logs.info("tokenized is")
        self.tokenized_df = tokenized_df
        logs.info(self.tokenized_df)
        self.tokenized_col_name = tokenized_col_name
        # This argument used to be accepted and then ignored in favour of
        # the module-level default; keep it so callers can tune batching.
        self.num_batches = num_batches
        # One binarized (sentences x vocabulary) matrix per batch; filled
        # lazily by binarize_words_in_sentence().
        self.mlb_list = []

    def binarize_words_in_sentence(self):
        """
        Binarizes the tokenized sentences against the vocabulary, batch by
        batch, and stores the resulting matrices in ``self.mlb_list``.

        Any previously stored batches are replaced, so calling this method
        more than once does not duplicate sentences.

        :return: None
        """
        logs.info("Creating co-occurrence matrix for PMI calculations.")
        num_rows = self.tokenized_df.shape[0]
        # At least two boundaries (one batch), and never more boundaries than
        # needed for one sentence per batch: extra ones would only delimit
        # empty batches.
        num_boundaries = max(2, min(int(self.num_batches), num_rows + 1))
        boundaries = np.linspace(0, num_rows, num_boundaries).astype(int)
        total = len(boundaries) - 1

        sentences = self.tokenized_df[self.tokenized_col_name]
        # The classes are fixed, so one binarizer serves every batch and all
        # batches share the same column order (the vocabulary index order).
        mlb = MultiLabelBinarizer(classes=self.vocab_counts_df.index)

        binarized_batches = []
        for batch_id, (start, stop) in enumerate(
            zip(boundaries[:-1], boundaries[1:])
        ):
            logs.info("%s of %s sentence binarize batches.", batch_id, total)
            # iloc: the boundaries are row positions, not index labels.
            binarized_batches.append(mlb.fit_transform(sentences.iloc[start:stop]))
        self.mlb_list = binarized_batches

    def calc_cooccurrences(self, subgroup, subgroup_idx):
        """
        Sums, over all binarized batches, the co-occurrences of every
        vocabulary word with the subgroup term.

        For each batch, the rows whose subgroup column is positive are kept
        and every word column is dotted with the subgroup column.

        :param subgroup: Subgroup term; kept for interface compatibility,
            only ``subgroup_idx`` affects the result.
        :param subgroup_idx: Column position of the subgroup term in the
            binarized matrices.
        :return: Single-column DataFrame (column label 0) with one row per
            matrix column; empty if there are no batches.
        """
        logs.info(
            "Approaching big computation! Here, we binarize all words in the sentences, making a sparse matrix of sentences."
        )
        if not self.mlb_list:
            self.binarize_words_in_sentence()

        num_batches = len(self.mlb_list)
        coo_counts = None
        for batch_id, batch_rows in enumerate(self.mlb_list):
            logs.info("%s of %s co-occurrence count batches", batch_id, num_batches)
            batch_rows = np.asarray(batch_rows)
            # Keep only the sentences that mention the subgroup term.
            with_subgroup = batch_rows[batch_rows[:, subgroup_idx] > 0]
            # (words x sentences) . (sentences,) -> one count per word.
            batch_counts = with_subgroup.T.dot(with_subgroup[:, subgroup_idx])
            if coo_counts is None:
                coo_counts = batch_counts
            else:
                coo_counts = coo_counts + batch_counts

        logs.info("Returning co-occurrence matrix")
        coo_df = pd.DataFrame(coo_counts)
        logs.info(coo_df)
        return coo_df

    def calc_paired_metrics(self, subgroup_pair, subgroup_npmi_dict):
        """
        Calculates nPMI metrics between paired subgroups.
        Special handling for a subgroup paired with itself.

        :param subgroup_pair: The two subgroup terms to compare; they are
            sorted before use, so their order does not matter.
        :param subgroup_npmi_dict: Maps each subgroup term to its
            ``(vocab_cooc_df, pmi_df, npmi_df)`` triple, as returned by
            calc_metrics().
        :return: Dict with keys "npmi", "pmi" and "count" (the per-subgroup
            tables inner-joined on shared words) and "npmi-bias" (nPMI of
            the first sorted subgroup minus nPMI of the second), each with
            missing rows dropped.
        """
        subgroup1, subgroup2 = sorted(subgroup_pair)
        vocab_cooc_df1, pmi_df1, npmi_df1 = subgroup_npmi_dict[subgroup1]
        logs.info("vocab cooc")
        logs.info(vocab_cooc_df1)

        if subgroup1 == subgroup2:
            # Nothing to join: the subgroup's own tables are the shared ones.
            shared_npmi_df = npmi_df1
            shared_pmi_df = pmi_df1
            shared_vocab_cooc_df = vocab_cooc_df1
        else:
            vocab_cooc_df2, pmi_df2, npmi_df2 = subgroup_npmi_dict[subgroup2]
            logs.info("vocab cooc2")
            logs.info(vocab_cooc_df2)
            # Inner joins keep only the words scored for both subgroups.
            join_kwargs = {"how": "inner", "lsuffix": "1", "rsuffix": "2"}
            shared_npmi_df = npmi_df1.join(npmi_df2, **join_kwargs)
            shared_pmi_df = pmi_df1.join(pmi_df2, **join_kwargs)
            shared_vocab_cooc_df = vocab_cooc_df1.join(vocab_cooc_df2, **join_kwargs)
            shared_vocab_cooc_df = shared_vocab_cooc_df.dropna()
            shared_vocab_cooc_df = shared_vocab_cooc_df[
                shared_vocab_cooc_df.index.notnull()
            ]

        logs.info("shared npmi df")
        logs.info(shared_npmi_df)
        logs.info("shared vocab df")
        logs.info(shared_vocab_cooc_df)

        npmi_bias = (
            shared_npmi_df[f"{subgroup1}-npmi"] - shared_npmi_df[f"{subgroup2}-npmi"]
        )
        return {
            "npmi": shared_npmi_df.dropna(),
            "pmi": shared_pmi_df.dropna(),
            "count": shared_vocab_cooc_df.dropna(),
            "npmi-bias": npmi_bias.dropna(),
        }

    def calc_metrics(self, subgroup):
        """
        Runs the full pipeline for one subgroup term: co-occurrence counts,
        then PMI, then nPMI.

        :param subgroup: Subgroup term; must be a label of the vocabulary
            index (a missing label raises KeyError from ``Index.get_loc``).
        :return: Tuple ``(vocab_cooc_df, pmi_df, npmi_df)``.
        """
        # Position of the subgroup term among the binarized columns.
        subgroup_idx = self.vocab_counts_df.index.get_loc(subgroup)
        logs.info("Calculating co-occurrences...")
        df_coo = self.calc_cooccurrences(subgroup, subgroup_idx)
        vocab_cooc_df = self.set_idx_cols(df_coo, subgroup)
        logs.info(vocab_cooc_df)
        logs.info("Calculating PMI...")
        pmi_df = self.calc_PMI(vocab_cooc_df, subgroup)
        logs.info(pmi_df)
        logs.info("Calculating nPMI...")
        npmi_df = self.calc_nPMI(pmi_df, vocab_cooc_df, subgroup)
        logs.info(npmi_df)
        return vocab_cooc_df, pmi_df, npmi_df

    def set_idx_cols(self, df_coo, subgroup):
        """
        Labels the raw co-occurrence counts with the vocabulary words.

        :param df_coo: Co-occurrence counts for subgroup, length is num_words
        :param subgroup: Subgroup term, used to name the count column.
        :return: DataFrame indexed by the vocabulary index with a single
            integer column named "<subgroup>-count".
        """
        count_col = f"{subgroup}-count"
        count_df = df_coo.set_index(self.vocab_counts_df.index)
        count_df.columns = [count_col]
        count_df[count_col] = count_df[count_col].astype(int)
        return count_df

    def calc_PMI(self, vocab_cooc_df, subgroup):
        """
        # PMI(x;y) = h(y) - h(y|x)
        #          = h(subgroup) - h(subgroup|word)
        #          = log (p(subgroup|word) / p(subgroup))
        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))

        :param vocab_cooc_df: Counts as returned by set_idx_cols().
        :param subgroup: Subgroup term.
        :return: DataFrame with a single "<subgroup>-pmi" column; words whose
            PMI is missing or infinite (e.g. zero co-occurrences) are dropped.
        """
        subgroup_prob = self.vocab_counts_df.loc[subgroup, "proportion"]
        # p(subgroup|word): share of the word's occurrences that co-occur
        # with the subgroup term.
        p_subgroup_g_word = (
            vocab_cooc_df[f"{subgroup}-count"] / self.vocab_counts_df["count"]
        )
        logs.info("p_subgroup_g_word is")
        logs.info(p_subgroup_g_word)
        # log(0) = -inf is expected here; silence it locally instead of
        # depending on process-wide numpy error settings.
        with np.errstate(divide="ignore", invalid="ignore"):
            pmi = np.log(p_subgroup_g_word / subgroup_prob)
        pmi_df = pd.DataFrame({f"{subgroup}-pmi": pmi})
        # Convert +/-inf to NaN explicitly rather than relying on the pandas
        # option "use_inf_as_na" (deprecated in 2.1, removed in 3.0).
        return pmi_df.replace([np.inf, -np.inf], np.nan).dropna()

    def calc_nPMI(self, pmi_df, vocab_cooc_df, subgroup):
        """
        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
        #                                           = -log(p(word|subgroup)p(word))

        :param pmi_df: PMI scores as returned by calc_PMI().
        :param vocab_cooc_df: Counts as returned by set_idx_cols().
        :param subgroup: Subgroup term.
        :return: DataFrame with a single "<subgroup>-npmi" column; words
            whose nPMI is missing or infinite are dropped.
        """
        counts = vocab_cooc_df[f"{subgroup}-count"]
        p_word_g_subgroup = counts / counts.sum()
        # Vectorised lookup of p(word) for exactly the words that have a PMI
        # score; this also yields an empty Series when pmi_df is empty.
        p_word = self.vocab_counts_df.loc[pmi_df.index, "proportion"]
        # NOTE(review): by the identity quoted in the docstring,
        # p(x,y) = p(x|y)p(y) would pair p(word|subgroup) with p(subgroup),
        # whereas p(word) is used here. Kept unchanged so the scores stay
        # the same -- confirm the intended normaliser.
        with np.errstate(divide="ignore", invalid="ignore"):
            normalize_pmi = -np.log(p_word_g_subgroup * p_word)
        npmi_df = pd.DataFrame(
            {f"{subgroup}-npmi": pmi_df[f"{subgroup}-pmi"] / normalize_pmi}
        )
        # A zero normaliser produces +/-inf; drop those like missing values
        # without depending on the pandas option "use_inf_as_na".
        return npmi_df.replace([np.inf, -np.inf], np.nan).dropna()
|
|