# Spaces: smhavens / AnalogyArcade (Sleeping)
# File size: 22,441 Bytes

from datasets import load_dataset
import shutil
import json
from collections import defaultdict
import multiprocessing
import gensim
from sklearn.metrics import classification_report
from gensim import corpora
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models import fasttext
from gensim.test.utils import datapath
from wefe.datasets import load_bingliu
from wefe.metrics import RNSB
from wefe.query import Query
from wefe.word_embedding_model import WordEmbeddingModel
from wefe.utils import plot_queries_results, run_queries
import pandas as pd
import gensim.downloader as api
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from wefe.metrics import WEAT
from wefe.datasets import load_weat
from wefe.utils import run_queries
from wefe.utils import plot_queries_results
import random
from scipy.special import expit
import math
import sys
import os
import argparse
import nltk
import scipy.sparse
import numpy as np
import string
import io
from sklearn.model_selection import train_test_split


'''STEPS FOR CODE:
1. Train word embeddings on Simple English Wikipedia;
2. Compare these to other pre-trained embeddings;
3. Quantify biases that exist in these word embeddings;
4. Use your word embeddings as features in a simple text classifier;
'''


def load_vectors(fname):
    '''Load word vectors from a whitespace-separated text ``.vec`` file.

    The first line is a header that must hold exactly two integers (bound to
    ``n`` and ``d`` below); each following line is a token followed by its
    space-separated float components.

    Args:
        fname: Path of the text file to read.

    Returns:
        dict mapping every token to a ``list`` of floats.

    Raises:
        ValueError: If the header is not two integers, or a component on a
            data line cannot be parsed as a float.
    '''
    data = {}
    # The context manager guarantees the handle is closed even when parsing
    # fails part-way through the file.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header values are parsed (so a malformed header fails fast)
        # but are not needed for building the table.
        n, d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Materialise the components: a bare map() is a one-shot
            # iterator that would be empty after its first traversal.
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data


def train_embeddings():
    '''TRAIN WORD EMBEDDINGS

    Loads the "20220301.simple" configuration of the "wikipedia" dataset,
    turns every page into lower-cased word lists (one list per sentence) and
    trains two gensim Word2Vec models on them with the same hyper-parameters
    (10 epochs, window 10, 50-dimensional vectors):

    * one with gensim's default training algorithm, saved as
      "word2vec.model";
    * one with ``sg=1`` (skip-gram), saved as "skip2vec.model".

    Returns:
        tuple ``(embeddings_model, skip_model)`` of the two trained models.
    '''
    dataset = load_dataset("wikipedia", "20220301.simple")

    # One entry per non-empty sentence; each entry is a list of tokens.
    data = []
    for page in dataset["train"]:
        # Treat line breaks as sentence boundaries too.
        document = page["text"].replace("\n", ". ")
        for sent in document.split("."):
            # Keep only alphanumeric and whitespace characters.
            clean_sent = "".join(s for s in sent if s.isalnum() or s.isspace())
            # NOTE(review): split(" ") only splits on the space character,
            # so tabs or non-breaking spaces stay inside a token — confirm
            # whether that is intended before switching to split().
            new_sent = [word.lower() for word in clean_sent.split(" ") if word]
            if new_sent:
                data.append(new_sent)

    embeddings_model = Word2Vec(
                     data,
                     epochs=10,
                     window=10,
                     vector_size=50)
    embeddings_model.save("word2vec.model")

    skip_model = Word2Vec(
                     data,
                     epochs=10,
                     window=10,
                     vector_size=50,
                     sg=1)
    skip_model.save("skip2vec.model")

    # The in-memory models are returned directly; reloading the files that
    # were written a moment ago would only repeat the disk I/O.
    return embeddings_model, skip_model


def get_data():
    '''Load Simple English Wikipedia and split it into tokenised sentences.

    Every page's text has its line breaks turned into sentence breaks, is
    split on ".", stripped of everything except alphanumeric and whitespace
    characters, and split on single spaces into lower-cased tokens.

    Returns:
        tuple ``(data, num_sents)`` where ``data`` is a list of non-empty
        token lists and ``num_sents`` counts every "."-delimited fragment
        seen, including fragments that produced no tokens.
    '''
    dataset = load_dataset("wikipedia", "20220301.simple")

    data = []
    num_sents = 0
    for page in dataset["train"]:
        document = page["text"].replace("\n", ". ")
        for sent in document.split("."):
            # Counted before filtering, so empty fragments are included.
            num_sents += 1
            clean_sent = "".join(s for s in sent if s.isalnum() or s.isspace())
            # NOTE(review): split(" ") leaves tabs / non-breaking spaces
            # inside tokens — confirm whether split() was intended.
            new_sent = [word.lower() for word in clean_sent.split(" ") if word]
            if new_sent:
                data.append(new_sent)

    return data, num_sents


def compare_embeddings(cbow, skip, urban, fasttext):
    '''COMPARE EMBEDDINGS

    Prints, for a fixed list of similarity/analogy probes, the two nearest
    neighbours reported by each of the four embedding sources.

    Args:
        cbow: Model queried through its ``.wv`` attribute.
        skip: Model queried through its ``.wv`` attribute.
        urban: Object queried directly via ``most_similar``.
        fasttext: Object queried directly via ``most_similar``.
    '''
    def report(plus, minus):
        # Fresh list copies per call, mirroring the separate literals that
        # each query would otherwise receive.
        print("cbow", cbow.wv.most_similar(positive=list(plus), negative=list(minus), topn=2))
        print("skip", skip.wv.most_similar(positive=list(plus), negative=list(minus), topn=2))
        print("urban", urban.most_similar(positive=list(plus), negative=list(minus), topn=2))
        print("fasttext", fasttext.most_similar(positive=list(plus), negative=list(minus), topn=2))

    # (headline, words added, words subtracted)
    probes = (
        ("Most Similar to dog", ('dog',), ()),
        ("\nMost Similar to Pizza - Pepperoni + Pretzel", ('pizza', 'pretzel'), ('pepperoni',)),
        ("\nMost Similar to witch - woman + man", ('witch', 'man'), ('woman',)),
        ("\nMost Similar to mayor - town + country", ('mayor', 'country'), ('town',)),
        ("\nMost Similar to death", ('death',), ()),
    )
    for headline, plus, minus in probes:
        print(headline)
        report(plus, minus)


def quantify_bias(cbow, skip, urban, fasttext):
    '''QUANTIFY BIASES

    Runs the WEFE ``WEAT`` metric for two queries against the four embedding
    sources, prints the (transposed, 2-decimal) result table and shows the
    plot produced by ``plot_queries_results``.

    Args:
        cbow: Model whose ``.wv`` vectors are wrapped under the name "CBOW".
        skip: Model whose ``.wv`` vectors are wrapped as "skip-gram".
        urban: Vectors wrapped as "urban dictionary".
        fasttext: Vectors wrapped as "fasttext".
    '''
    weat_wordset = load_weat()

    models = [WordEmbeddingModel(cbow.wv, "CBOW"),
              WordEmbeddingModel(skip.wv, "skip-gram"),
              WordEmbeddingModel(urban, "urban dictionary"),
              WordEmbeddingModel(fasttext, "fasttext")]

    # Target set shared by both queries.
    religions = ['christianity',
                 'catholicism',
                 'islam',
                 'judaism',
                 'hinduism',
                 'buddhism',
                 'mormonism',
                 'scientology',
                 'taoism',
                 'atheism']
    queries = [
        # Religion vs Arts with respect to Career and Family
        Query([religions, weat_wordset['arts']],
            [weat_wordset['career'], weat_wordset['family']],
            ['Religion', 'Art'], ['Career', 'Family']),

        # Religion vs Weapons with respect to Male and Female terms
        Query([religions, weat_wordset['weapons']],
            [weat_wordset['male_terms'], weat_wordset['female_terms']],
            ['Religion', 'Weapons'], ['Male terms', 'Female terms']),

    ]

    # NOTE(review): the two preprocessor entries presumably mean "look the
    # word up as-is, then retry lower-cased" — confirm against WEFE docs.
    wefe_results = run_queries(WEAT,
                                queries,
                                models,
                                metric_params ={
                                    'preprocessors': [
                                        {},
                                        {'lowercase': True }
                                    ]
                                },
                                warn_not_found_words = True
                                ).T.round(2)

    print(wefe_results)
    plot_queries_results(wefe_results).show()


def text_classifier(cbow):
    '''SIMPLE TEXT CLASSIFIER

    Each review is represented by the average of the embeddings of its
    in-vocabulary words, so the feature size is the embedding dimension
    rather than the vocabulary size. A logistic-regression model is trained
    on those features with ``sgd_for_lr_with_ce`` and a classification
    report for the training documents is printed.

    Up to 1000 files are read from each of 'aclImdb/train/pos' (label 1) and
    'aclImdb/train/neg' (label 0). Reviews without a single in-vocabulary
    word are dropped together with their label, so features and labels
    always stay aligned.

    Args:
        cbow: Trained model exposing its vectors as ``cbow.wv``.

    Raises:
        FileNotFoundError: If no review files are found.
        ValueError: If no review contains an in-vocabulary word.
    '''
    num_files_per_class = 1000
    pos_train_files = glob.glob('aclImdb/train/pos/*')[:num_files_per_class]
    neg_train_files = glob.glob('aclImdb/train/neg/*')[:num_files_per_class]
    if not pos_train_files and not neg_train_files:
        raise FileNotFoundError(
            "no review files found under aclImdb/train/pos or aclImdb/train/neg")

    # Pair every path with its label up front so that skipping a document
    # can never shift the labels relative to the feature rows.
    labelled_files = ([(path, 1) for path in pos_train_files]
                      + [(path, 0) for path in neg_train_files])

    seen_tokens = set()  # required by avg_embeddings; not used afterwards
    features = []
    y = []
    for path, label in labelled_files:
        doc_vector = avg_embeddings(path, cbow, seen_tokens)
        if len(doc_vector) == 0:
            # Nothing in this review is in the embedding vocabulary.
            continue
        features.append(doc_vector)
        y.append(label)

    if not features:
        raise ValueError("no review contains an in-vocabulary word")

    # Stack into an (n_documents, embedding_dim) matrix.
    X = np.vstack(features)

    w, b = sgd_for_lr_with_ce(X, y, num_passes=10)
    y_pred = predict_y_lr(w, b, X)
    # predict_y_lr returns a column vector; flatten it to match y.
    print(classification_report(y, np.ravel(y_pred)))

    # TODO: evaluate on held-out files from aclImdb/test/{pos,neg} using the
    # same averaging, instead of reporting training-set performance only.


def avg_embeddings(doc, model, vocab: set):
    '''Average the embeddings of the in-vocabulary words of one text file.

    Args:
        doc: Path of the text file; it is split on whitespace.
        model: Object exposing vectors as ``model.wv`` (supports
            ``key_to_index`` and list indexing).
        vocab: Set that is updated in place with every token found in the
            file, whether or not the model knows it.

    Returns:
        The mean vector of the known tokens (duplicates count each time),
        or an empty list when the file has no known token.
    '''
    # Explicit encoding: the platform default differs between systems.
    with open(doc, "r", encoding="utf-8") as file:
        words = [word for line in file for word in line.split()]
    vocab.update(words)

    # Dict lookup is O(1) per token; membership in the index_to_key list
    # would scan the whole vocabulary for every token.
    known = model.wv.key_to_index
    words = [word for word in words if word in known]
    if words:
        return np.mean(model.wv[words], axis=0)
    return []
    


def sent_vec(sent, cbow):
    '''Return the mean embedding of the tokens of ``sent`` known to ``cbow``.

    Args:
        sent: Iterable of tokens.
        cbow: Model exposing vectors as ``cbow.wv`` (supports ``in``,
            indexing by token and ``vector_size``).

    Returns:
        Array of length ``cbow.wv.vector_size``: the average of the matching
        token vectors, or all zeros when no token is known.
    '''
    vectors = cbow.wv
    total = np.zeros(vectors.vector_size)
    # Count from zero so the divisor is exactly the number of vectors
    # summed; starting at one would shrink every average.
    matched = 0
    for w in sent:
        if w in vectors:
            matched += 1
            total += vectors[w]
    if matched == 0:
        # Avoid 0/0; the zero vector is the neutral fallback.
        return total
    return total / matched


def spacy_tokenizer(sentence):
    '''Placeholder tokenizer: ignores ``sentence`` and always returns 0.

    The spaCy-based pipeline sketched in the comments below is disabled, so
    no tokenisation is performed.
    '''
    # Disabled sketch of the intended pipeline:
    #   1. build the annotated document
    # doc = nlp(sentence)
    #   2. lemmatize each token and lower-case it
    # mytokens = [ word.lemma_.lower().strip() for word in doc ]
    #   3. remove stop words and punctuation
    # mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # NOTE(review): the sketch was meant to return the preprocessed token
    # list; the live code returns the constant 0 instead.
    return 0


def cbow_classifier(cbow, data, num_sentances):
    '''Unfinished experiment: build per-word features and split them.

    NOTE(review): as written this function cannot complete — the
    ``train_test_split`` call below reads a name ``y`` that is neither a
    parameter nor assigned anywhere in this function, and no module-level
    ``y`` is visible in this file. The only call site visible in this file
    is commented out.

    Args:
        cbow: Model exposing vectors as ``cbow.wv``.
        data: Iterable of token lists (one per sentence).
        num_sentances: Not read anywhere in the body.

    Returns:
        Nothing; the split results are bound to locals and discarded.
    '''
    # Computed but never read afterwards.
    vocab_len = len(cbow.wv.index_to_key)
    
    embeddings = []
    embedding_dict = {}
    # Set copy of the model vocabulary for O(1) membership tests.
    vocab = set(cbow.wv.index_to_key)
    
    # Allocated but never read afterwards.
    X_temp = np.empty([len(data), 1])
    # One array of word vectors per sentence, skipping unknown tokens.
    # X_train_vect and X_test_vect are built by identical expressions from
    # the same data, and neither is used by the live code below.
    # NOTE(review): sentences of different lengths make the outer array
    # ragged; recent NumPy versions reject that without dtype=object —
    # confirm.
    X_train_vect = np.array([np.array([cbow.wv[i] for i in ls if i in vocab])
                         for ls in data])
    X_test_vect = np.array([np.array([cbow.wv[i] for i in ls if i in vocab])
                         for ls in data])
    
    for word in vocab:
        # NOTE(review): cbow.wv[word] is presumably a single 1-D vector, so
        # the mean over axis 0 collapses it to one scalar per word — confirm
        # whether a per-document average was intended instead.
        embeddings.append(np.mean(cbow.wv[word], axis=0))
        embedding_dict[word] = np.mean(cbow.wv[word], axis=0)
    
    X = embeddings
    
    # NOTE(review): `y` is undefined here (see docstring).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)
    
    # A disabled draft used to follow here: it averaged the per-sentence
    # vectors in X_train_vect / X_test_vect (zero vector for empty
    # sentences), trained sgd_for_lr_with_ce on the averages, printed a
    # classification_report, and outlined a dev-set evaluation on
    # aclImdb/test/{pos,neg}.


def sgd_for_lr_with_ce(X, y, num_passes=5, learning_rate=0.1):
    '''Fit logistic-regression parameters with per-example gradient steps.

    Weights and bias start at zero. Each pass visits every row of ``X`` once
    in a freshly shuffled order (using the ``random`` module) and moves the
    parameters against ``(sigmoid(x·w + b) - y) * x``.

    Args:
        X: 2-D feature matrix, one row per example.
        y: Labels indexable by row number.
        num_passes: Number of full sweeps over the data; this fixed count is
            the only stopping rule.
        learning_rate: Step size applied to every update.

    Returns:
        tuple ``(w, b)``: the weight vector and the bias.
    '''
    n_rows, n_cols = X.shape[0], X.shape[1]
    w = np.zeros(n_cols)
    b = 0.0

    for _ in range(num_passes):
        # New random visiting order for every sweep.
        visit_order = list(range(n_rows))
        random.shuffle(visit_order)
        for idx in visit_order:
            row = X[idx]
            # Prediction error for this example: y_hat - y.
            residual = expit(row.dot(w) + b) - y[idx]
            w = w - learning_rate * residual * row
            b = b - learning_rate * residual

    return w, b


def predict_y_lr(w,b,X,threshold=0.5):
    '''Predict binary labels with a logistic-regression model.

    Args:
        w: 1-D weight vector with one entry per feature.
        b: Scalar bias.
        X: 2-D feature matrix, one row per example.
        threshold: Probability above which an example is labelled 1.

    Returns:
        Integer array of shape ``(n_examples, 1)`` holding 0/1 predictions.
    '''
    # w becomes a column vector so the matrix product lines up.
    logits = X.dot(w.reshape((-1, 1))) + b

    # Map logits to probabilities with the same sigmoid used in training;
    # comparing the raw logit with a probability threshold would shift the
    # decision boundary.
    probabilities = expit(logits)

    return np.where(probabilities > threshold, 1, 0)


def _load_pretrained_vectors():
    '''Read both pre-trained ``.vec`` files and cache them as gensim models.

    Returns:
        tuple ``(ud_model, wiki_model)``.
    '''
    wiki_model = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subwords.vec", binary=False)
    ud_model = KeyedVectors.load_word2vec_format("ud_basic.vec", binary=False)
    # Saved so that later runs with --skip can load them directly.
    wiki_model.save("wiki2vec.model")
    ud_model.save("urban2vec.model")
    return ud_model, wiki_model


def main():
    '''Parse the command line and run the requested experiment steps.

    ``--compare`` and ``--bias`` need all four embedding sources; they are
    obtained once and shared by both steps. ``--text`` only needs the first
    Word2Vec model.
    '''
    parser = argparse.ArgumentParser(
        prog='word_embedding',
        description='This program will train a word embedding model using simple wikipedia.',
        epilog='To skip training the model and to used the saved model "word2vec.model", use the command --skip or -s.'
    )
    parser.add_argument('-s', '--skip', action='store_true',
                        help='load every model from its saved file instead of training')
    parser.add_argument('-e', '--extra', action='store_true',
                        help='for --compare/--bias: load the saved Word2Vec models but '
                             're-read the pre-trained .vec files')
    parser.add_argument('-b', '--bias', action='store_true',
                        help='run the bias quantification step')
    parser.add_argument('-c', '--compare', action='store_true',
                        help='run the embedding comparison step')
    parser.add_argument('-t', '--text', action='store_true',
                        help='run the text classifier step')

    args = parser.parse_args()
    skip_model = None
    cbow_model = None
    ud_model = None
    wiki_model = None
    # True once train_embeddings() has run in this process, so later steps
    # can reuse its result instead of retraining from scratch.
    trained_this_run = False

    if args.compare or args.bias:
        if args.skip:
            cbow_model = Word2Vec.load("word2vec.model")
            skip_model = Word2Vec.load("skip2vec.model")
            ud_model = KeyedVectors.load("urban2vec.model")
            wiki_model = KeyedVectors.load("wiki2vec.model")
        elif args.extra:
            cbow_model = Word2Vec.load("word2vec.model")
            skip_model = Word2Vec.load("skip2vec.model")
            ud_model, wiki_model = _load_pretrained_vectors()
        else:
            cbow_model, skip_model = train_embeddings()
            trained_this_run = True
            ud_model, wiki_model = _load_pretrained_vectors()

    if args.compare:
        compare_embeddings(cbow_model, skip_model, ud_model, wiki_model)
    if args.bias:
        quantify_bias(cbow_model, skip_model, ud_model, wiki_model)

    if args.text:
        if args.skip:
            if cbow_model is None:
                cbow_model = Word2Vec.load("word2vec.model")
        elif not trained_this_run:
            # Without --skip this step always works on a freshly trained
            # model (also under --extra), training at most once per run.
            cbow_model, skip_model = train_embeddings()

        text_classifier(cbow_model)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()