{ "cells": [ { "cell_type": "markdown", "id": "eeaa927c-b8ef-4ee5-ab03-8257899152fd", "metadata": {}, "source": [ "# Building Dynamic Wordlists from WordNet as a fallback\n", "\n", "I am using an article from [GeeksforGeeks](https://www.geeksforgeeks.org/get-synonymsantonyms-nltk-wordnet-python/amp/) to guide building lists using NLTK's `WordNet`. I am considering that this may be a way to avoid having to build custom lists and want to test it out.\n", "\n", "# Builds a dataframe dynamically from WordNet using NLTK.\n", "def wordnet_df(word,POS=False,seed_definition=None):\n", " pos_options = ['NOUN','VERB','ADJ','ADV']\n", " synonyms, antonyms = syn_ant(word,POS,False)\n", " #print(synonyms, antonyms) #for QA purposes\n", " words = []\n", " cats = []\n", " #WordNet hates spaces so you have to remove them\n", " m_word = word.replace(\" \", \"_\")\n", " \n", " #Allow the user to pick a seed definition if it is not provided directly to the function.\n", " if seed_definition is None:\n", " if POS in pos_options:\n", " seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS))]\n", " else:\n", " seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word)]\n", " for d in range(len(seed_definitions)):\n", " print(f\"{d}: {seed_definitions[d]}\")\n", " choice = int(input(\"Which of the definitions above most aligns to your selection?\"))\n", " seed_definition = seed_definitions[choice]\n", " \n", " if POS in pos_options:\n", " for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS)):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .7:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", "\n", " if len(synonyms) > 0:\n", " for w in synonyms:\n", " w = w.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(w, pos=getattr(wordnet, POS)):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", " if len(antonyms) > 0:\n", " for a in antonyms:\n", " a = a.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(a, pos=getattr(wordnet, POS)):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", " else:\n", " for syn in wordnet.synsets(m_word):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .7:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll)) \n", " if len(synonyms) > 0:\n", " for w in synonyms:\n", " w = w.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(w):\n", 
" if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", " if len(antonyms) > 0:\n", " for a in antonyms:\n", " a = a.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(a):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", "\n", " df = {\"Categories\":cats, \"Words\":words}\n", " df = pd.DataFrame(df) \n", " df = df.drop_duplicates().reset_index()\n", " df = df.drop(\"index\", axis=1)\n", " return df" ] }, { "cell_type": "markdown", "id": "4048815e-8434-4db9-bbb2-652fe0076df3", "metadata": {}, "source": [ "# Building Dynamic Wordlists from WordNet as a fallback\n", "\n", "I am using an article from [GeeksforGeeks](https://www.geeksforgeeks.org/get-synonymsantonyms-nltk-wordnet-python/amp/) to guide building lists using NLTK's `WordNet`. I am considering that this may be a way to avoid having to build custom lists and want to test it out." ] }, { "cell_type": "markdown", "id": "41374b5c-12c0-4e4d-aa73-20db04b280ff", "metadata": {}, "source": [ "# Building Dynamic Wordlists from WordNet as a fallback\n", "\n", "I am using an article from [GeeksforGeeks](https://www.geeksforgeeks.org/get-synonymsantonyms-nltk-wordnet-python/amp/) to guide building lists using NLTK's `WordNet`. I am considering that this may be a way to avoid having to build custom lists and want to test it out." ] }, { "cell_type": "code", "execution_count": 1, "id": "26a97377-67be-4903-9bfa-e8660aeb8c90", "metadata": {}, "outputs": [], "source": [ "#Import necessary libraries.\n", "import re, nltk, pandas as pd, numpy as np, ssl\n", "from nltk.corpus import wordnet\n", "import spacy\n", "nlp = spacy.load(\"en_core_web_lg\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "42e7a838-bb82-4736-8f70-127c53fea68b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package omw-1.4 to\n", "[nltk_data] /Users/nbutters/nltk_data...\n", "[nltk_data] Package omw-1.4 is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#If an error is thrown that the corpus \"omw-1.4\" isn't discoverable you can use this code. 
(https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed)\n", "try:\n", "    _create_unverified_https_context = ssl._create_unverified_context\n", "except AttributeError:\n", "    pass\n", "else:\n", "    ssl._create_default_https_context = _create_unverified_https_context\n", "\n", "nltk.download('omw-1.4')" ] }, { "cell_type": "code", "execution_count": 15, "id": "14918489-e5fe-4898-8d4a-8bc0f7b1d9e0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Synset('bantam.s.01')]\n", "16 [Synset('bantam.n.01'), Synset('bantam.s.01'), Synset('diminutive.n.01'), Synset('bantam.s.01'), Synset('lilliputian.n.01'), Synset('lilliputian.n.02'), Synset('lilliputian.a.01'), Synset('bantam.s.01'), Synset('fiddling.s.01'), Synset('dwarf.n.01'), Synset('bantam.s.01'), Synset('petite.n.01'), Synset('bantam.s.01'), Synset('bantam.s.01'), Synset('flyspeck.n.01'), Synset('bantam.s.01')]\n" ] } ], "source": [ "#Collect each synset's lemmas plus the lemmas of its hyponyms, then look up the synsets of every lemma found.\n", "tiny_syns = wordnet.synsets(\"tiny\")\n", "print(tiny_syns)\n", "new_list = []\n", "for syn in tiny_syns:\n", "    cur_lemmas = syn.lemmas()\n", "    hypos = syn.hyponyms()\n", "    for hypo in hypos:\n", "        cur_lemmas.extend(hypo.lemmas())\n", "    for lemma in cur_lemmas:\n", "        new_list.append(lemma.name())\n", "syns = []\n", "for lemma in new_list:\n", "    syns.extend(wordnet.synsets(lemma))\n", "print(len(syns),syns)" ] }, { "cell_type": "code", "execution_count": null, "id": "c3047d11-0512-41af-9db7-62daa8cbb60d", "metadata": {}, "outputs": [], "source": [ "#Here I define a few test sentences from the Duct-Tape-Pipeline.\n", "upt1 = \"I like movies starring black actors.\"\n", "upt2 = \"I am a black trans-woman.\"\n", "upt3 = \"Native Americans deserve to have their land back.\"\n", "upt4 = \"This movie was filmed in Iraq.\"" ] }, { "cell_type": "code", "execution_count": 16, "id": "b52425b5-2c4d-4a31-a240-feabc319198b", "metadata": {}, "outputs": [], "source": [ "# A simple function to pull synonyms and antonyms using spaCy's POS tags.\n", "def syn_ant(word,POS=False,human=True):\n", "    pos_options = ['NOUN','VERB','ADJ','ADV']\n", "    synonyms = []\n", "    antonyms = []\n", "    #WordNet hates spaces so you have to remove them\n", "    if \" \" in word:\n", "        word = word.replace(\" \", \"_\")\n", "    \n", "    #Restrict the lookup to the given part of speech if one was provided.\n", "    if POS in pos_options:\n", "        syns = wordnet.synsets(word, pos=getattr(wordnet, POS))\n", "    else:\n", "        syns = wordnet.synsets(word)\n", "    for syn in syns:\n", "        for l in syn.lemmas():\n", "            current = l.name()\n", "            if human:\n", "                current = re.sub(\"_\",\" \",current)\n", "            synonyms.append(current)\n", "            if l.antonyms():\n", "                for ant in l.antonyms():\n", "                    cur_ant = ant.name()\n", "                    if human:\n", "                        cur_ant = re.sub(\"_\",\" \",cur_ant)\n", "                    antonyms.append(cur_ant)\n", "    synonyms = list(set(synonyms))\n", "    antonyms = list(set(antonyms))\n", "    return synonyms, antonyms" ] }, { "cell_type": "code", "execution_count": 22, "id": "7cd10cf1-bf0d-4baa-8588-9315cfbe760e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['man', \"gentleman's gentleman\", 'Isle of Man', 'Man', 'humanity', 'human', 'piece', 'valet de chambre', 'mankind', 'humans', 'military personnel', 'adult male', 'homo', 'human race', 'valet', 'humankind', 
'military man', 'human being', 'serviceman', 'world', 'gentleman', 'human beings'] ['woman', 'civilian']\n" ] } ], "source": [ "x, q = syn_ant(\"man\")\n", "print(x,q)" ] }, { "cell_type": "code", "execution_count": null, "id": "3b15bcba-ca91-49a0-b873-aff1e61b3053", "metadata": {}, "outputs": [], "source": [ "doc1 = nlp(upt1)\n", "doc2 = nlp(upt2)\n", "doc3 = nlp(upt3)\n", "doc4 = nlp(upt4)" ] }, { "cell_type": "code", "execution_count": null, "id": "1220c67b-1776-4a39-8335-c88b96379122", "metadata": {}, "outputs": [], "source": [ "syn_ant(doc3[0].text,doc3[0].pos_)" ] }, { "cell_type": "code", "execution_count": null, "id": "4cc3ada2-90c1-4ecf-b704-9c4bf5146406", "metadata": {}, "outputs": [], "source": [ "#Discovered that NLTK WordNet uses \"_\" for compounds... and fixed it.\n", "syn_ant(\"Native_American\", \"NOUN\")" ] }, { "cell_type": "code", "execution_count": null, "id": "52253e37-5eb1-42f8-a4b8-046542004349", "metadata": {}, "outputs": [], "source": [ "syn_ant(\"Papua_New_Guinea\")" ] }, { "cell_type": "code", "execution_count": null, "id": "14eb3c2f-2a5a-4db0-a802-172d9902df70", "metadata": {}, "outputs": [], "source": [ "syn_ant(\"hate\")" ] }, { "cell_type": "code", "execution_count": null, "id": "53a3d7af-980a-47bc-b70f-65c1411fba05", "metadata": {}, "outputs": [], "source": [ "mother_syn = wordnet.synset('mother.n.01')\n", "print(mother_syn.hyponyms())\n", "hypos = []\n", "for hyponym in mother_syn.hyponyms():\n", "    hypos.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in hyponym.lemmas()])\n", "hypos" ] }, { "cell_type": "code", "execution_count": null, "id": "bd60bca8-96a9-45f5-8c60-e7177ead3f35", "metadata": {}, "outputs": [], "source": [ "hyper_list = wordnet.synset('woman.n.01')\n", "print(hyper_list.hypernyms())\n", "hypers = []\n", "for hypernym in hyper_list.hypernyms():\n", "    hypers.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in hypernym.lemmas()])\n", "hypers" ] }, { "cell_type": "code", "execution_count": null, "id": "9b3f4a7f-a4a6-4862-a321-42e1d4be406a", "metadata": {}, "outputs": [], "source": [ "hyper_list = wordnet.synset('man.n.01')\n", "print(hyper_list.hypernyms())\n", "hypers = []\n", "for hypernym in hyper_list.hypernyms():\n", "    hypers.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in hypernym.lemmas()])\n", "hypers" ] }, { "cell_type": "code", "execution_count": null, "id": "b9932230-e38d-444d-9814-8668d4bf596c", "metadata": {}, "outputs": [], "source": [ "parent = wordnet.synset('male.n.02')\n", "print(parent.hyponyms())\n", "hypos = []\n", "for hyponym in parent.hyponyms():\n", "    hypos.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in hyponym.lemmas()])\n", "hypos" ] }, { "cell_type": "code", "execution_count": null, "id": "51bf50a7-87e0-485e-b055-b5a59c44db06", "metadata": {}, "outputs": [], "source": [ "hypo2 = [[re.sub(\"_\",\" \",lemma.name()) for lemma in hyponym.lemmas()] for hyponym in parent.hyponyms()]\n", "hypo2" ] }, { "cell_type": "code", "execution_count": null, "id": "96cce82d-854a-4c3f-b2a1-d8d224357b1d", "metadata": {}, "outputs": [], "source": [ "syn_ant(\"white supremacist\",\"NOUN\",human=False)" ] },
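{ "cell_type": "markdown", "id": "3a1f2b4c-5d6e-4f70-8a9b-0c1d2e3f4a5b", "metadata": {}, "source": [ "The hypernym/hyponym cells above all repeat the same extend-the-lemmas pattern, so here is a small helper that captures it (my own sketch; `lemma_names` is a name I made up, not something from the article or NLTK)." ] }, { "cell_type": "code", "execution_count": null, "id": "4b2c3d5e-6f70-4a81-9b0c-1d2e3f4a5b6c", "metadata": {}, "outputs": [], "source": [ "#Sketch of a reusable helper for the pattern above:\n", "#collect lemma names (underscores converted to spaces) from a list of synsets.\n", "def lemma_names(synsets):\n", "    names = []\n", "    for s in synsets:\n", "        names.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in s.lemmas()])\n", "    return names\n", "\n", "print(lemma_names(wordnet.synset('mother.n.01').hyponyms()))\n", "print(lemma_names(wordnet.synset('woman.n.01').hypernyms()))" ] }, { "cell_type": "raw", "id": "f6cc0a50-ec83-4951-a9bf-86e89e334945", "metadata": {}, "source": [ "## Here's an attempt to explore ConceptNet\n", "# I have currently commented it out because it is not as useful for where I'm trying to go.\n", "'''This is an attempt to use [ConceptNet](https://conceptnet.io/), specifically calling their API ([see documentation](https://github.com/commonsense/conceptnet5/wiki/API)). 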
If I can figure out how to build a list of synonyms and antonyms from here then it may represent a good way to set defaults.\n", "\n", "#import the necessary library\n", "import requests\n", "\n", "obj = requests.get('http://api.conceptnet.io/c/en/black').json()\n", "obj.keys()'''" ] }, { "cell_type": "code", "execution_count": null, "id": "310dfd86-d2df-4023-85bc-bed22099b890", "metadata": {}, "outputs": [], "source": [ "# Builds a list dynamically from WordNet using NLTK.\n", "def wordnet_list(word,POS=False):\n", "    pos_options = ['NOUN','VERB','ADJ','ADV']\n", "    synonyms, antonyms = syn_ant(word,POS,False)\n", "    base = []\n", "    final = [word]\n", "    #WordNet hates spaces so you have to remove them\n", "    m_word = word.replace(\" \", \"_\")\n", "    \n", "    #Gather the synsets (and their hyponyms) for the word, its synonyms, and its antonyms.\n", "    if POS in pos_options:\n", "        for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS)):\n", "            base.extend(syn.hyponyms())\n", "            base.append(syn)\n", "\n", "        if len(synonyms) > 0:\n", "            for w in synonyms:\n", "                w = w.replace(\" \",\"_\")\n", "                for syn in wordnet.synsets(w, pos=getattr(wordnet, POS)):\n", "                    base.extend(syn.hyponyms())\n", "                    base.append(syn)\n", "        if len(antonyms) > 0:\n", "            for a in antonyms:\n", "                a = a.replace(\" \",\"_\")\n", "                for syn in wordnet.synsets(a, pos=getattr(wordnet, POS)):\n", "                    base.extend(syn.hyponyms())\n", "                    base.append(syn)\n", "    else:\n", "        for syn in wordnet.synsets(m_word):\n", "            base.extend(syn.hyponyms())\n", "            base.append(syn)\n", "\n", "        if len(synonyms) > 0:\n", "            for w in synonyms:\n", "                w = w.replace(\" \",\"_\")\n", "                for syn in wordnet.synsets(w):\n", "                    base.extend(syn.hyponyms())\n", "                    base.append(syn)\n", "        if len(antonyms) > 0:\n", "            for a in antonyms:\n", "                a = a.replace(\" \",\"_\")\n", "                for syn in wordnet.synsets(a):\n", "                    base.extend(syn.hyponyms())\n", "                    base.append(syn)\n", "    base = list(set(base))\n", "    #Convert every lemma of every collected synset into a human-readable word.\n", "    for b in base:\n", "        final.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in b.lemmas()])\n", "\n", "    final = list(set(final))\n", "    return final" ] }, { "cell_type": "code", "execution_count": null, "id": "331ad5b4-15da-454f-8b3d-9fe7b131a6d4", "metadata": {}, "outputs": [], "source": [ "wordnet_list(\"white supremacist\", \"NOUN\")" ] }, { "cell_type": "code", "execution_count": null, "id": "866e3ba9-6213-4725-9627-2b5054f996e8", "metadata": {}, "outputs": [], "source": [ "words = wordnet_list(\"girl\", \"NOUN\")\n", "print(f\"The length of the list is {len(words)}.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "d284ea9c-71d2-4860-882e-64b280d0d699", "metadata": {}, "outputs": [], "source": [ "text = \"The girl was brought to the front of the class.\"\n", "test_doc = nlp(text)" ] }, { "cell_type": "code", "execution_count": null, "id": "f53cbe5f-50db-4b2c-b59a-66864669b244", "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame()\n", "df[\"Words\"] = words\n", "\n", "df[\"Sentences\"] = df.Words.apply(lambda x: text.replace(\"girl\",x))\n", "\n", "df[\"Similarity\"] = df.Words.apply(lambda x: nlp(\"girl\").similarity(nlp(x)[0]))" ] }, { "cell_type": "code", "execution_count": null, "id": "5b4e37dd-f899-47c9-93ea-f92898760819", "metadata": {}, "outputs": [], "source": [ "df.sort_values(by='Similarity', ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "27f912c6-bfa9-4604-8d9d-f4502a0f0ea7", "metadata": {}, "outputs": [], "source": [ "df2 = df[df.Similarity > 0].reset_index()" ] }, { "cell_type": "code", "execution_count": null, "id": 
"59e35f36-f434-4377-a170-d109ca89dd77", "metadata": {}, "outputs": [], "source": [ "df2" ] }, { "cell_type": "code", "execution_count": null, "id": "2c754d73-9d74-471d-a36f-84a404aa7093", "metadata": {}, "outputs": [], "source": [ "minimum = df2.Similarity.min()\n", "text2 = df2.loc[df2['Similarity'] == minimum, 'Words'].iloc[0]\n", "text2" ] }, { "cell_type": "code", "execution_count": null, "id": "e2853e02-faea-4ce8-800c-70cc6273be02", "metadata": {}, "outputs": [], "source": [ "maximum = df2[df2.Words != \"girl\"].Similarity.max()\n", "text3 = df2.loc[df2['Similarity'] == maximum, 'Words'].iloc[0]\n", "text3" ] }, { "cell_type": "code", "execution_count": null, "id": "3f03e090-6c99-41db-b013-a77f2fec6e4d", "metadata": {}, "outputs": [], "source": [ "df3 = df2[df.Similarity > .5].reset_index()" ] }, { "cell_type": "code", "execution_count": null, "id": "2bab0a7f-27c7-4dbf-a92b-b0c13d25e5b0", "metadata": {}, "outputs": [], "source": [ "homo = wordnet.synsets('gay')" ] }, { "cell_type": "code", "execution_count": null, "id": "1a1130a9-b5aa-47b4-9e33-738b08f92c7c", "metadata": {}, "outputs": [], "source": [ "for syn in homo:\n", " print(syn.lemmas())" ] }, { "cell_type": "code", "execution_count": null, "id": "d6e48617-aefb-46f6-9961-21da7f81c8d4", "metadata": {}, "outputs": [], "source": [ "mother = wordnet.synsets('homo')\n", "cats = []\n", "words = []\n", "for syn in mother:\n", " lemmas = syn.lemmas()\n", " for lemma in lemmas:\n", " ll = lemma.name()\n", " print(ll)\n", " cats.append(syn.name().split(\".\")[0])\n", " words.append(ll)\n", " \n", "print(cats,words)\n", "print(len(cats),len(words))\n", "df = {\"Categories\":cats, \"Words\":words}\n", "df = pd.DataFrame(df)" ] }, { "cell_type": "code", "execution_count": null, "id": "8d55ba70-a569-429e-88d0-84f99772b9be", "metadata": {}, "outputs": [], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "id": "097f927b-28e6-4d10-99a2-621bb758bb77", "metadata": {}, "outputs": [], "source": [ "def process_text(text):\n", " doc = nlp(text.lower())\n", " result = []\n", " for token in doc:\n", " if (token.is_stop) or (token.is_punct) or (token.lemma_ == '-PRON-'):\n", " continue\n", " result.append(token.lemma_)\n", " return \" \".join(result)" ] }, { "cell_type": "code", "execution_count": null, "id": "18b4469e-4457-405e-9736-58ab9e8d8ac6", "metadata": {}, "outputs": [], "source": [ "def clean_definition(syn):\n", " #This function removes stop words from sentences to improve on document level similarity for differentiation.\n", " if type(syn) is str:\n", " synset = wordnet.synset(syn).definition()\n", " elif type(syn) is nltk.corpus.reader.wordnet.Synset:\n", " synset = syn.definition()\n", " definition = nlp(\" \".join(token.lemma_ for token in nlp(synset) if not token.is_stop))\n", " return definition\n", "\n", "def check_sim(a,b):\n", " if type(a) is str and type(b) is str:\n", " a = nlp(a)\n", " b = nlp(b)\n", " similarity = a.similarity(b)\n", " return similarity" ] }, { "cell_type": "code", "execution_count": null, "id": "ed2323c6-cee1-4d6b-8d33-a53755036acd", "metadata": {}, "outputs": [], "source": [ "# Builds a dataframe dynamically from WordNet using NLTK.\n", "def wordnet_df(word,POS=False,seed_definition=None):\n", " pos_options = ['NOUN','VERB','ADJ','ADV']\n", " synonyms, antonyms = syn_ant(word,POS,False)\n", " #print(synonyms, antonyms) #for QA purposes\n", " words = []\n", " cats = []\n", " #WordNet hates spaces so you have to remove them\n", " m_word = word.replace(\" \", \"_\")\n", " \n", " #Allow the user 
to pick a seed definition if it is not provided directly to the function.\n", " if seed_definition is None:\n", " if POS in pos_options:\n", " seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS))]\n", " else:\n", " seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word)]\n", " for d in range(len(seed_definitions)):\n", " print(f\"{d}: {seed_definitions[d]}\")\n", " choice = int(input(\"Which of the definitions above most aligns to your selection?\"))\n", " seed_definition = seed_definitions[choice]\n", " \n", " if POS in pos_options:\n", " for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS)):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .7:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", "\n", " if len(synonyms) > 0:\n", " for w in synonyms:\n", " w = w.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(w, pos=getattr(wordnet, POS)):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", " if len(antonyms) > 0:\n", " for a in antonyms:\n", " a = a.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(a, pos=getattr(wordnet, POS)):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", " else:\n", " for syn in wordnet.synsets(m_word):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .7:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll)) \n", " if len(synonyms) > 0:\n", " for w in synonyms:\n", " w = w.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(w):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", " if len(antonyms) > 0:\n", " for a in antonyms:\n", " a = a.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(a):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", "\n", " df 
= {\"Categories\":cats, \"Words\":words}\n", " df = pd.DataFrame(df) \n", " df = df.drop_duplicates().reset_index()\n", " df = df.drop(\"index\", axis=1)\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "id": "2e9224f8-9620-464d-8a27-6b5b2ff27983", "metadata": {}, "outputs": [], "source": [ "df_mother = wordnet_df(\"gay\")\n", "df_mother" ] }, { "cell_type": "code", "execution_count": null, "id": "003c8941-77e9-45de-a1f5-bd3ac8b6d4a2", "metadata": {}, "outputs": [], "source": [ "len(df_mother)" ] }, { "cell_type": "code", "execution_count": null, "id": "3b196716-ee0d-479b-922c-9bac711dd535", "metadata": {}, "outputs": [], "source": [ "test = wordnet.synsets(\"mother\",wordnet.NOUN)" ] }, { "cell_type": "code", "execution_count": null, "id": "bebb801e-6e43-463f-8a3d-7a09b709836e", "metadata": {}, "outputs": [], "source": [ "test" ] }, { "cell_type": "code", "execution_count": null, "id": "b9a87480-1e25-4614-95fc-5aa201efb9c3", "metadata": {}, "outputs": [], "source": [ "test1 = wordnet.synsets('father',wordnet.NOUN)\n", "testx = wordnet.synset(\"mother.n.01\")\n", "for syn in test1:\n", " definition = clean_definition(syn)\n", " test_def = clean_definition(testx)\n", " print(test_def)\n", " print(syn, definition, check_sim(process_text(test_def.text),process_text(definition.text)))" ] }, { "cell_type": "code", "execution_count": null, "id": "1757d434-5f67-465a-9559-34ce2eacf1f1", "metadata": {}, "outputs": [], "source": [ "test = \"colonizer.n.01\"" ] }, { "cell_type": "code", "execution_count": null, "id": "ea8eb48a-49dd-4aae-b5ae-ae0db2bddc49", "metadata": {}, "outputs": [], "source": [ "test2 = \"mother.n.01\"" ] }, { "cell_type": "code", "execution_count": null, "id": "7b99656d-efab-4c6e-8dca-0d604cbd5bbe", "metadata": {}, "outputs": [], "source": [ "mother = nlp(wordnet.synset(\"black.n.05\").definition())\n", "print(mother)\n", "colony = nlp(wordnet.synset(\"white.n.01\").definition())\n", "print(colony)\n", "print(mother.similarity(colony))" ] }, { "cell_type": "code", "execution_count": null, "id": "b735efc4-0c84-4632-b850-7395f27971fe", "metadata": {}, "outputs": [], "source": [ "mother_processed = nlp(process_text(mother.text))\n", "colony_processed = nlp(process_text(colony.text))" ] }, { "cell_type": "code", "execution_count": null, "id": "c5972e22-ae21-4b6a-9c1a-cbd907c8aef1", "metadata": {}, "outputs": [], "source": [ "print(mother_processed.similarity(colony_processed))" ] }, { "cell_type": "code", "execution_count": null, "id": "1046cc13-ea10-4bb1-bd59-daa1507c5c19", "metadata": {}, "outputs": [], "source": [ "a = clean_definition(test)\n", "\n", "b = clean_definition(test2)\n", "\n", "a.similarity(b)" ] }, { "cell_type": "code", "execution_count": null, "id": "d9f46e35-2bc9-4937-bb5f-ce6d268150af", "metadata": {}, "outputs": [], "source": [ "a_p = nlp(process_text(a.text))\n", "b_p = nlp(process_text(b.text))\n", "a_p.similarity(b_p)" ] }, { "cell_type": "code", "execution_count": null, "id": "451815b5-3fa8-48c4-a89a-7b70d19d00da", "metadata": {}, "outputs": [], "source": [ "check_sim(a,b)" ] }, { "cell_type": "code", "execution_count": null, "id": "98519912-fe93-4b9a-85cb-9d7001735627", "metadata": {}, "outputs": [], "source": [ "test3 = wordnet.synset(\"white_supremacist.n.01\")\n", "c = clean_definition(test3)\n", "a.similarity(c)" ] }, { "cell_type": "code", "execution_count": null, "id": "2d44f7ee-f7a7-421e-9cb5-d0f599c2b9ab", "metadata": {}, "outputs": [], "source": [ "def get_parallel(word, seed_definition, QA=False):\n", " cleaned = 
nlp(process_text(seed_definition))\n", "    root_syns = wordnet.synsets(word)\n", "    hypers = []\n", "    new_hypos = []\n", "    \n", "    #Collect the hypernyms of every sense of the word, then gather their hyponyms as candidates.\n", "    for syn in root_syns:\n", "        hypers.extend(syn.hypernyms())\n", "    \n", "    #hypers = list(set([syn for syn in hypers if cleaned.similarity(nlp(process_text(syn.definition()))) >=.5]))\n", "    \n", "    for syn in hypers:\n", "        new_hypos.extend(syn.hyponyms())\n", "    \n", "    hypos = list(set([syn for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.75]))\n", "    print(len(hypos)) #for QA purposes\n", "    #Tier the similarity threshold by how many candidates survived the first pass; order the branches so each one is reachable.\n", "    if len(hypos) < 3:\n", "        hypos = list(set([(syn, cleaned.similarity(nlp(process_text(syn.definition())))) for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.5]))\n", "    elif len(hypos) < 10:\n", "        hypos = list(set([(syn, cleaned.similarity(nlp(process_text(syn.definition())))) for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.66]))\n", "    elif len(hypos) < 20:\n", "        hypos = list(set([(syn, cleaned.similarity(nlp(process_text(syn.definition())))) for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.8]))\n", "    else:\n", "        hypos = list(set([(syn, cleaned.similarity(nlp(process_text(syn.definition())))) for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.9]))\n", "    if QA:\n", "        print(hypers)\n", "        print(hypos)\n", "        return hypers, hypos\n", "    else:\n", "        return hypos\n", "\n", "# Builds a dataframe dynamically from WordNet using NLTK.\n", "def wordnet_parallel_df(word,POS=False,seed_definition=None):\n", "    pos_options = ['NOUN','VERB','ADJ','ADV']\n", "    synonyms, antonyms = syn_ant(word,POS,False)\n", "    #print(synonyms, antonyms) #for QA purposes\n", "    words = []\n", "    cats = []\n", "    #WordNet hates spaces so you have to remove them\n", "    m_word = word.replace(\" \", \"_\")\n", "    \n", "    #Allow the user to pick a seed definition if it is not provided directly to the function.\n", "    if seed_definition is None:\n", "        if POS in pos_options:\n", "            seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS))]\n", "        else:\n", "            seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word)]\n", "        for d in range(len(seed_definitions)):\n", "            print(f\"{d}: {seed_definitions[d]}\")\n", "        choice = int(input(\"Which of the definitions above most aligns to your selection?\"))\n", "        seed_definition = seed_definitions[choice]\n", "    \n", "    hypos = get_parallel(m_word,seed_definition)\n", "    for syn,sim in hypos:\n", "        cur_lemmas = syn.lemmas()\n", "        cur_hypos = syn.hyponyms()\n", "        for hypo in cur_hypos:\n", "            cur_lemmas.extend(hypo.lemmas())\n", "        for lemma in cur_lemmas:\n", "            ll = lemma.name()\n", "            cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", "            words.append(re.sub(\"_\",\" \",ll))\n", "\n", "    df = {\"Categories\":cats, \"Words\":words}\n", "    df = pd.DataFrame(df)\n", "    df = df.drop_duplicates().reset_index(drop=True)\n", "    return df" ] }, { "cell_type": "code", "execution_count": null, "id": "dbd95998-ec11-4166-93fa-18c0a99c4d6e", "metadata": {}, "outputs": [], "source": [ "gay_root = wordnet.synsets(\"gay\")" ] }, { "cell_type": "code", "execution_count": null, "id": "3cdc4a08-2e90-4ab9-ae5e-6c95fd162048", "metadata": {}, "outputs": [], "source": [ "gay = wordnet.synset(\"gay.s.06\").definition()\n", "print(gay)\n", "hypers, hypos1 = get_parallel(\"gay\",gay,True)" ] }, { "cell_type": "code", "execution_count": null, "id": 
"34b80b88-3089-44b5-8c5b-13e5f7ea8446", "metadata": {}, "outputs": [], "source": [ "len(hypos1)" ] }, { "cell_type": "code", "execution_count": null, "id": "a134ba49-19cc-4937-b6af-67e044e3bcd2", "metadata": {}, "outputs": [], "source": [ "for root in gay_root:\n", " print(root, root.definition())" ] }, { "cell_type": "code", "execution_count": null, "id": "662ff6a8-b5af-4c6a-8102-39b66b85e5d1", "metadata": {}, "outputs": [], "source": [ "wordnet.synsets(\"chinese\")" ] }, { "cell_type": "code", "execution_count": null, "id": "4bc77b81-8c43-4cbb-bc7e-a178e76d3659", "metadata": {}, "outputs": [], "source": [ "chinese = wordnet.synset(\"chinese.a.01\").definition()\n", "hypers, hypos = get_parallel(\"chinese\",chinese,True)" ] }, { "cell_type": "code", "execution_count": null, "id": "8b66bb7a-0ede-48a1-888a-c90e81e2d75d", "metadata": {}, "outputs": [], "source": [ "lemmas = []\n", "for hypo in hypos1:\n", " lemmas.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in hypo[0].lemmas()])\n", "lemmas" ] }, { "cell_type": "code", "execution_count": null, "id": "221c43f2-05f1-4a48-95a8-eb6a122527e9", "metadata": {}, "outputs": [], "source": [ "len(lemmas)" ] }, { "cell_type": "code", "execution_count": null, "id": "3d75b92b-be76-45c5-b955-d1f64ec03bd4", "metadata": {}, "outputs": [], "source": [ "df = wordnet_parallel_df(\"gay\",seed_definition=gay)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "35194a7a-a814-43c6-a57c-c40e54b81847", "metadata": {}, "outputs": [], "source": [ "len(df)" ] }, { "cell_type": "code", "execution_count": null, "id": "29618210-fec7-40b6-b326-107e8570abca", "metadata": {}, "outputs": [], "source": [ "df_grouped = df.groupby('Categories').count()" ] }, { "cell_type": "code", "execution_count": null, "id": "407cda3a-1d7a-4863-aa1e-e69860e6cfb5", "metadata": {}, "outputs": [], "source": [ "df_grouped.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "3b70c510-997a-4675-963c-ca7000e79eb4", "metadata": {}, "outputs": [], "source": [ "tiny = wordnet.synsets(\"tiny\", wordnet.ADJ)" ] }, { "cell_type": "code", "execution_count": null, "id": "2fe63d4d-b080-49ae-a1b6-487e8b440e76", "metadata": {}, "outputs": [], "source": [ "tiny" ] }, { "cell_type": "code", "execution_count": null, "id": "9661c299-369b-4538-86d9-003b3dc9fa5c", "metadata": {}, "outputs": [], "source": [ "tiny[0].lemmas()" ] }, { "cell_type": "code", "execution_count": null, "id": "99a6e4d9-2923-41a9-94b3-09d21c699f21", "metadata": {}, "outputs": [], "source": [ "new_alt = []\n", "for lemma in tiny[0].lemmas():\n", " new_alt.extend(wordnet.synsets(lemma.name()))\n", "new_alt" ] }, { "cell_type": "code", "execution_count": null, "id": "7ac3e75d-8a0e-44e2-910a-dcfcea86fa9f", "metadata": {}, "outputs": [], "source": [ "new_alt2 = list(set(new_alt))" ] }, { "cell_type": "code", "execution_count": null, "id": "3617a495-c722-466f-a74b-1e22bf025248", "metadata": {}, "outputs": [], "source": [ "for alt in new_alt2:\n", " print(alt,alt.hypernyms())" ] }, { "cell_type": "code", "execution_count": null, "id": "cc2f7839-f219-4cf8-ab5d-82781016e6c5", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }