{ "cells": [ { "cell_type": "markdown", "id": "eeaa927c-b8ef-4ee5-ab03-8257899152fd", "metadata": {}, "source": [ "# Building Dynamic Wordlists from WordNet as a fallback\n", "\n", "I am using an article from [GeeksforGeeks](https://www.geeksforgeeks.org/get-synonymsantonyms-nltk-wordnet-python/amp/) to guide building lists using NLTK's `WordNet`. I am considering that this may be a way to avoid having to build custom lists and want to test it out.\n", "\n", "# Builds a dataframe dynamically from WordNet using NLTK.\n", "def wordnet_df(word,POS=False,seed_definition=None):\n", " pos_options = ['NOUN','VERB','ADJ','ADV']\n", " synonyms, antonyms = syn_ant(word,POS,False)\n", " #print(synonyms, antonyms) #for QA purposes\n", " words = []\n", " cats = []\n", " #WordNet hates spaces so you have to remove them\n", " m_word = word.replace(\" \", \"_\")\n", " \n", " #Allow the user to pick a seed definition if it is not provided directly to the function.\n", " if seed_definition is None:\n", " if POS in pos_options:\n", " seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS))]\n", " else:\n", " seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word)]\n", " for d in range(len(seed_definitions)):\n", " print(f\"{d}: {seed_definitions[d]}\")\n", " choice = int(input(\"Which of the definitions above most aligns to your selection?\"))\n", " seed_definition = seed_definitions[choice]\n", " \n", " if POS in pos_options:\n", " for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS)):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .7:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", "\n", " if len(synonyms) > 0:\n", " for w in synonyms:\n", " w = w.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(w, pos=getattr(wordnet, POS)):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", " if len(antonyms) > 0:\n", " for a in antonyms:\n", " a = a.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(a, pos=getattr(wordnet, POS)):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", " else:\n", " for syn in wordnet.synsets(m_word):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .7:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll)) \n", " if len(synonyms) > 0:\n", " for w in synonyms:\n", " w = w.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(w):\n", 
" if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", " if len(antonyms) > 0:\n", " for a in antonyms:\n", " a = a.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(a):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", "\n", " df = {\"Categories\":cats, \"Words\":words}\n", " df = pd.DataFrame(df) \n", " df = df.drop_duplicates().reset_index()\n", " df = df.drop(\"index\", axis=1)\n", " return df" ] }, { "cell_type": "markdown", "id": "4048815e-8434-4db9-bbb2-652fe0076df3", "metadata": {}, "source": [ "# Building Dynamic Wordlists from WordNet as a fallback\n", "\n", "I am using an article from [GeeksforGeeks](https://www.geeksforgeeks.org/get-synonymsantonyms-nltk-wordnet-python/amp/) to guide building lists using NLTK's `WordNet`. I am considering that this may be a way to avoid having to build custom lists and want to test it out." ] }, { "cell_type": "markdown", "id": "41374b5c-12c0-4e4d-aa73-20db04b280ff", "metadata": {}, "source": [ "# Building Dynamic Wordlists from WordNet as a fallback\n", "\n", "I am using an article from [GeeksforGeeks](https://www.geeksforgeeks.org/get-synonymsantonyms-nltk-wordnet-python/amp/) to guide building lists using NLTK's `WordNet`. I am considering that this may be a way to avoid having to build custom lists and want to test it out." ] }, { "cell_type": "code", "execution_count": 1, "id": "26a97377-67be-4903-9bfa-e8660aeb8c90", "metadata": {}, "outputs": [], "source": [ "#Import necessary libraries.\n", "import re, nltk, pandas as pd, numpy as np, ssl\n", "from nltk.corpus import wordnet\n", "import spacy\n", "nlp = spacy.load(\"en_core_web_lg\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "42e7a838-bb82-4736-8f70-127c53fea68b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package omw-1.4 to\n", "[nltk_data] /Users/nbutters/nltk_data...\n", "[nltk_data] Package omw-1.4 is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#If an error is thrown that the corpus \"omw-1.4\" isn't discoverable you can use this code. 
(https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed)\n", "try:\n", "    _create_unverified_https_context = ssl._create_unverified_context\n", "except AttributeError:\n", "    pass\n", "else:\n", "    ssl._create_default_https_context = _create_unverified_https_context\n", "\n", "nltk.download('omw-1.4')" ] }, { "cell_type": "code", "execution_count": 15, "id": "14918489-e5fe-4898-8d4a-8bc0f7b1d9e0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Synset('bantam.s.01')]\n", "16 [Synset('bantam.n.01'), Synset('bantam.s.01'), Synset('diminutive.n.01'), Synset('bantam.s.01'), Synset('lilliputian.n.01'), Synset('lilliputian.n.02'), Synset('lilliputian.a.01'), Synset('bantam.s.01'), Synset('fiddling.s.01'), Synset('dwarf.n.01'), Synset('bantam.s.01'), Synset('petite.n.01'), Synset('bantam.s.01'), Synset('bantam.s.01'), Synset('flyspeck.n.01'), Synset('bantam.s.01')]\n" ] } ], "source": [ "#Collect each synset's lemmas plus the lemmas of its hyponyms, then look up the synsets of every lemma found.\n", "tiny_syns = wordnet.synsets(\"tiny\")\n", "print(tiny_syns)\n", "new_list = []\n", "for syn in tiny_syns:\n", "    cur_lemmas = syn.lemmas()\n", "    hypos = syn.hyponyms()\n", "    for hypo in hypos:\n", "        cur_lemmas.extend(hypo.lemmas())\n", "    for lemma in cur_lemmas:\n", "        new_list.append(lemma.name())\n", "syns = []\n", "for lemma in new_list:\n", "    syns.extend(wordnet.synsets(lemma))\n", "print(len(syns),syns)" ] }, { "cell_type": "code", "execution_count": null, "id": "c3047d11-0512-41af-9db7-62daa8cbb60d", "metadata": {}, "outputs": [], "source": [ "#Here I define a few test sentences from the Duct-Tape-Pipeline.\n", "upt1 = \"I like movies starring black actors.\"\n", "upt2 = \"I am a black trans-woman.\"\n", "upt3 = \"Native Americans deserve to have their land back.\"\n", "upt4 = \"This movie was filmed in Iraq.\"" ] }, { "cell_type": "code", "execution_count": 16, "id": "b52425b5-2c4d-4a31-a240-feabc319198b", "metadata": {}, "outputs": [], "source": [ "# A simple function to pull synonyms and antonyms using spaCy's POS tags.\n", "def syn_ant(word,POS=False,human=True):\n", "    pos_options = ['NOUN','VERB','ADJ','ADV']\n", "    synonyms = []\n", "    antonyms = []\n", "    #WordNet hates spaces so you have to remove them\n", "    if \" \" in word:\n", "        word = word.replace(\" \", \"_\")\n", "    \n", "    #Restrict the lookup to the given part of speech if one was provided.\n", "    if POS in pos_options:\n", "        syns = wordnet.synsets(word, pos=getattr(wordnet, POS))\n", "    else:\n", "        syns = wordnet.synsets(word)\n", "    for syn in syns:\n", "        for l in syn.lemmas():\n", "            current = l.name()\n", "            if human:\n", "                current = re.sub(\"_\",\" \",current)\n", "            synonyms.append(current)\n", "            if l.antonyms():\n", "                for ant in l.antonyms():\n", "                    cur_ant = ant.name()\n", "                    if human:\n", "                        cur_ant = re.sub(\"_\",\" \",cur_ant)\n", "                    antonyms.append(cur_ant)\n", "    synonyms = list(set(synonyms))\n", "    antonyms = list(set(antonyms))\n", "    return synonyms, antonyms" ] }, { "cell_type": "code", "execution_count": 22, "id": "7cd10cf1-bf0d-4baa-8588-9315cfbe760e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['man', \"gentleman's gentleman\", 'Isle of Man', 'Man', 'humanity', 'human', 'piece', 'valet de chambre', 'mankind', 'humans', 'military personnel', 'adult male', 'homo', 'human race', 'valet', 'humankind', 
'military man', 'human being', 'serviceman', 'world', 'gentleman', 'human beings'] ['woman', 'civilian']\n" ] } ], "source": [ "x, q = syn_ant(\"man\")\n", "print(x,q)" ] }, { "cell_type": "code", "execution_count": null, "id": "3b15bcba-ca91-49a0-b873-aff1e61b3053", "metadata": {}, "outputs": [], "source": [ "doc1 = nlp(upt1)\n", "doc2 = nlp(upt2)\n", "doc3 = nlp(upt3)\n", "doc4 = nlp(upt4)" ] }, { "cell_type": "code", "execution_count": null, "id": "1220c67b-1776-4a39-8335-c88b96379122", "metadata": {}, "outputs": [], "source": [ "syn_ant(doc3[0].text,doc3[0].pos_)" ] }, { "cell_type": "code", "execution_count": null, "id": "4cc3ada2-90c1-4ecf-b704-9c4bf5146406", "metadata": {}, "outputs": [], "source": [ "#Discovered that NLTK WordNet uses \"_\" for compounds... and fixed it.\n", "syn_ant(\"Native_American\", \"NOUN\")" ] }, { "cell_type": "code", "execution_count": null, "id": "52253e37-5eb1-42f8-a4b8-046542004349", "metadata": {}, "outputs": [], "source": [ "syn_ant(\"Papua_New_Guinea\")" ] }, { "cell_type": "code", "execution_count": null, "id": "14eb3c2f-2a5a-4db0-a802-172d9902df70", "metadata": {}, "outputs": [], "source": [ "syn_ant(\"hate\")" ] }, { "cell_type": "code", "execution_count": null, "id": "53a3d7af-980a-47bc-b70f-65c1411fba05", "metadata": {}, "outputs": [], "source": [ "mother_syn = wordnet.synset('mother.n.01')\n", "print(mother_syn.hyponyms())\n", "hypos = []\n", "for hyponym in mother_syn.hyponyms():\n", "    hypos.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in hyponym.lemmas()])\n", "hypos" ] }, { "cell_type": "code", "execution_count": null, "id": "bd60bca8-96a9-45f5-8c60-e7177ead3f35", "metadata": {}, "outputs": [], "source": [ "hyper_list = wordnet.synset('woman.n.01')\n", "print(hyper_list.hypernyms())\n", "hypers = []\n", "for hypernym in hyper_list.hypernyms():\n", "    hypers.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in hypernym.lemmas()])\n", "hypers" ] }, { "cell_type": "code", "execution_count": null, "id": "9b3f4a7f-a4a6-4862-a321-42e1d4be406a", "metadata": {}, "outputs": [], "source": [ "hyper_list = wordnet.synset('man.n.01')\n", "print(hyper_list.hypernyms())\n", "hypers = []\n", "for hypernym in hyper_list.hypernyms():\n", "    hypers.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in hypernym.lemmas()])\n", "hypers" ] }, { "cell_type": "code", "execution_count": null, "id": "b9932230-e38d-444d-9814-8668d4bf596c", "metadata": {}, "outputs": [], "source": [ "parent = wordnet.synset('male.n.02')\n", "print(parent.hyponyms())\n", "hypos = []\n", "for hyponym in parent.hyponyms():\n", "    hypos.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in hyponym.lemmas()])\n", "hypos" ] }, { "cell_type": "code", "execution_count": null, "id": "51bf50a7-87e0-485e-b055-b5a59c44db06", "metadata": {}, "outputs": [], "source": [ "hypo2 = [[re.sub(\"_\",\" \",lemma.name()) for lemma in hyponym.lemmas()] for hyponym in parent.hyponyms()]\n", "hypo2" ] }, { "cell_type": "code", "execution_count": null, "id": "96cce82d-854a-4c3f-b2a1-d8d224357b1d", "metadata": {}, "outputs": [], "source": [ "syn_ant(\"white supremacist\",\"NOUN\",human=False)" ] },
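{ "cell_type": "markdown", "id": "3a1f2b4c-5d6e-4f70-8a9b-0c1d2e3f4a5b", "metadata": {}, "source": [ "The hypernym/hyponym cells above all repeat the same extend-the-lemmas pattern, so here is a small helper that captures it (my own sketch; `lemma_names` is a name I made up, not something from the article or NLTK)." ] }, { "cell_type": "code", "execution_count": null, "id": "4b2c3d5e-6f70-4a81-9b0c-1d2e3f4a5b6c", "metadata": {}, "outputs": [], "source": [ "#Sketch of a reusable helper for the pattern above:\n", "#collect lemma names (underscores converted to spaces) from a list of synsets.\n", "def lemma_names(synsets):\n", "    names = []\n", "    for s in synsets:\n", "        names.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in s.lemmas()])\n", "    return names\n", "\n", "print(lemma_names(wordnet.synset('mother.n.01').hyponyms()))\n", "print(lemma_names(wordnet.synset('woman.n.01').hypernyms()))" ] }, { "cell_type": "raw", "id": "f6cc0a50-ec83-4951-a9bf-86e89e334945", "metadata": {}, "source": [ "## Here's an attempt to explore ConceptNet\n", "# I have currently commented it out because it is not as useful for where I'm trying to go.\n", "'''This is an attempt to use [ConceptNet](https://conceptnet.io/), specifically calling their API ([see documentation](https://github.com/commonsense/conceptnet5/wiki/API)). 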
If I can figure out how to build a list of synonyms and antonyms from here then it may represent a good way to set defaults.\n", "\n", "#import the necessary library\n", "import requests\n", "\n", "obj = requests.get('http://api.conceptnet.io/c/en/black').json()\n", "obj.keys()'''" ] }, { "cell_type": "code", "execution_count": null, "id": "310dfd86-d2df-4023-85bc-bed22099b890", "metadata": {}, "outputs": [], "source": [ "# Builds a list dynamically from WordNet using NLTK.\n", "def wordnet_list(word,POS=False):\n", "    pos_options = ['NOUN','VERB','ADJ','ADV']\n", "    synonyms, antonyms = syn_ant(word,POS,False)\n", "    base = []\n", "    final = [word]\n", "    #WordNet hates spaces so you have to remove them\n", "    m_word = word.replace(\" \", \"_\")\n", "    \n", "    #Gather the synsets (and their hyponyms) for the word, its synonyms, and its antonyms.\n", "    if POS in pos_options:\n", "        for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS)):\n", "            base.extend(syn.hyponyms())\n", "            base.append(syn)\n", "\n", "        if len(synonyms) > 0:\n", "            for w in synonyms:\n", "                w = w.replace(\" \",\"_\")\n", "                for syn in wordnet.synsets(w, pos=getattr(wordnet, POS)):\n", "                    base.extend(syn.hyponyms())\n", "                    base.append(syn)\n", "        if len(antonyms) > 0:\n", "            for a in antonyms:\n", "                a = a.replace(\" \",\"_\")\n", "                for syn in wordnet.synsets(a, pos=getattr(wordnet, POS)):\n", "                    base.extend(syn.hyponyms())\n", "                    base.append(syn)\n", "    else:\n", "        for syn in wordnet.synsets(m_word):\n", "            base.extend(syn.hyponyms())\n", "            base.append(syn)\n", "\n", "        if len(synonyms) > 0:\n", "            for w in synonyms:\n", "                w = w.replace(\" \",\"_\")\n", "                for syn in wordnet.synsets(w):\n", "                    base.extend(syn.hyponyms())\n", "                    base.append(syn)\n", "        if len(antonyms) > 0:\n", "            for a in antonyms:\n", "                a = a.replace(\" \",\"_\")\n", "                for syn in wordnet.synsets(a):\n", "                    base.extend(syn.hyponyms())\n", "                    base.append(syn)\n", "    base = list(set(base))\n", "    #Convert every lemma of every collected synset into a human-readable word.\n", "    for b in base:\n", "        final.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in b.lemmas()])\n", "\n", "    final = list(set(final))\n", "    return final" ] }, { "cell_type": "code", "execution_count": null, "id": "331ad5b4-15da-454f-8b3d-9fe7b131a6d4", "metadata": {}, "outputs": [], "source": [ "wordnet_list(\"white supremacist\", \"NOUN\")" ] }, { "cell_type": "code", "execution_count": null, "id": "866e3ba9-6213-4725-9627-2b5054f996e8", "metadata": {}, "outputs": [], "source": [ "words = wordnet_list(\"girl\", \"NOUN\")\n", "print(f\"The length of the list is {len(words)}.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "d284ea9c-71d2-4860-882e-64b280d0d699", "metadata": {}, "outputs": [], "source": [ "text = \"The girl was brought to the front of the class.\"\n", "test_doc = nlp(text)" ] }, { "cell_type": "code", "execution_count": null, "id": "f53cbe5f-50db-4b2c-b59a-66864669b244", "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame()\n", "df[\"Words\"] = words\n", "\n", "df[\"Sentences\"] = df.Words.apply(lambda x: text.replace(\"girl\",x))\n", "\n", "df[\"Similarity\"] = df.Words.apply(lambda x: nlp(\"girl\").similarity(nlp(x)[0]))" ] }, { "cell_type": "code", "execution_count": null, "id": "5b4e37dd-f899-47c9-93ea-f92898760819", "metadata": {}, "outputs": [], "source": [ "df.sort_values(by='Similarity', ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "27f912c6-bfa9-4604-8d9d-f4502a0f0ea7", "metadata": {}, "outputs": [], "source": [ "df2 = df[df.Similarity > 0].reset_index()" ] }, { "cell_type": "code", "execution_count": null, "id": 
"59e35f36-f434-4377-a170-d109ca89dd77", "metadata": {}, "outputs": [], "source": [ "df2" ] }, { "cell_type": "code", "execution_count": null, "id": "2c754d73-9d74-471d-a36f-84a404aa7093", "metadata": {}, "outputs": [], "source": [ "minimum = df2.Similarity.min()\n", "text2 = df2.loc[df2['Similarity'] == minimum, 'Words'].iloc[0]\n", "text2" ] }, { "cell_type": "code", "execution_count": null, "id": "e2853e02-faea-4ce8-800c-70cc6273be02", "metadata": {}, "outputs": [], "source": [ "maximum = df2[df2.Words != \"girl\"].Similarity.max()\n", "text3 = df2.loc[df2['Similarity'] == maximum, 'Words'].iloc[0]\n", "text3" ] }, { "cell_type": "code", "execution_count": null, "id": "3f03e090-6c99-41db-b013-a77f2fec6e4d", "metadata": {}, "outputs": [], "source": [ "df3 = df2[df.Similarity > .5].reset_index()" ] }, { "cell_type": "code", "execution_count": null, "id": "2bab0a7f-27c7-4dbf-a92b-b0c13d25e5b0", "metadata": {}, "outputs": [], "source": [ "homo = wordnet.synsets('gay')" ] }, { "cell_type": "code", "execution_count": null, "id": "1a1130a9-b5aa-47b4-9e33-738b08f92c7c", "metadata": {}, "outputs": [], "source": [ "for syn in homo:\n", " print(syn.lemmas())" ] }, { "cell_type": "code", "execution_count": null, "id": "d6e48617-aefb-46f6-9961-21da7f81c8d4", "metadata": {}, "outputs": [], "source": [ "mother = wordnet.synsets('homo')\n", "cats = []\n", "words = []\n", "for syn in mother:\n", " lemmas = syn.lemmas()\n", " for lemma in lemmas:\n", " ll = lemma.name()\n", " print(ll)\n", " cats.append(syn.name().split(\".\")[0])\n", " words.append(ll)\n", " \n", "print(cats,words)\n", "print(len(cats),len(words))\n", "df = {\"Categories\":cats, \"Words\":words}\n", "df = pd.DataFrame(df)" ] }, { "cell_type": "code", "execution_count": null, "id": "8d55ba70-a569-429e-88d0-84f99772b9be", "metadata": {}, "outputs": [], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "id": "097f927b-28e6-4d10-99a2-621bb758bb77", "metadata": {}, "outputs": [], "source": [ "def process_text(text):\n", " doc = nlp(text.lower())\n", " result = []\n", " for token in doc:\n", " if (token.is_stop) or (token.is_punct) or (token.lemma_ == '-PRON-'):\n", " continue\n", " result.append(token.lemma_)\n", " return \" \".join(result)" ] }, { "cell_type": "code", "execution_count": null, "id": "18b4469e-4457-405e-9736-58ab9e8d8ac6", "metadata": {}, "outputs": [], "source": [ "def clean_definition(syn):\n", " #This function removes stop words from sentences to improve on document level similarity for differentiation.\n", " if type(syn) is str:\n", " synset = wordnet.synset(syn).definition()\n", " elif type(syn) is nltk.corpus.reader.wordnet.Synset:\n", " synset = syn.definition()\n", " definition = nlp(\" \".join(token.lemma_ for token in nlp(synset) if not token.is_stop))\n", " return definition\n", "\n", "def check_sim(a,b):\n", " if type(a) is str and type(b) is str:\n", " a = nlp(a)\n", " b = nlp(b)\n", " similarity = a.similarity(b)\n", " return similarity" ] }, { "cell_type": "code", "execution_count": null, "id": "ed2323c6-cee1-4d6b-8d33-a53755036acd", "metadata": {}, "outputs": [], "source": [ "# Builds a dataframe dynamically from WordNet using NLTK.\n", "def wordnet_df(word,POS=False,seed_definition=None):\n", " pos_options = ['NOUN','VERB','ADJ','ADV']\n", " synonyms, antonyms = syn_ant(word,POS,False)\n", " #print(synonyms, antonyms) #for QA purposes\n", " words = []\n", " cats = []\n", " #WordNet hates spaces so you have to remove them\n", " m_word = word.replace(\" \", \"_\")\n", " \n", " #Allow the user 
to pick a seed definition if it is not provided directly to the function.\n", " if seed_definition is None:\n", " if POS in pos_options:\n", " seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS))]\n", " else:\n", " seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word)]\n", " for d in range(len(seed_definitions)):\n", " print(f\"{d}: {seed_definitions[d]}\")\n", " choice = int(input(\"Which of the definitions above most aligns to your selection?\"))\n", " seed_definition = seed_definitions[choice]\n", " \n", " if POS in pos_options:\n", " for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS)):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .7:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", "\n", " if len(synonyms) > 0:\n", " for w in synonyms:\n", " w = w.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(w, pos=getattr(wordnet, POS)):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", " if len(antonyms) > 0:\n", " for a in antonyms:\n", " a = a.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(a, pos=getattr(wordnet, POS)):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", " else:\n", " for syn in wordnet.synsets(m_word):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .7:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll)) \n", " if len(synonyms) > 0:\n", " for w in synonyms:\n", " w = w.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(w):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", " if len(antonyms) > 0:\n", " for a in antonyms:\n", " a = a.replace(\" \",\"_\")\n", " for syn in wordnet.synsets(a):\n", " if check_sim(process_text(seed_definition),process_text(syn.definition())) > .6:\n", " cur_lemmas = syn.lemmas()\n", " hypos = syn.hyponyms()\n", " for hypo in hypos:\n", " cur_lemmas.extend(hypo.lemmas())\n", " for lemma in cur_lemmas:\n", " ll = lemma.name()\n", " cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", " words.append(re.sub(\"_\",\" \",ll))\n", "\n", " df 
= {\"Categories\":cats, \"Words\":words}\n", " df = pd.DataFrame(df) \n", " df = df.drop_duplicates().reset_index()\n", " df = df.drop(\"index\", axis=1)\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "id": "2e9224f8-9620-464d-8a27-6b5b2ff27983", "metadata": {}, "outputs": [], "source": [ "df_mother = wordnet_df(\"gay\")\n", "df_mother" ] }, { "cell_type": "code", "execution_count": null, "id": "003c8941-77e9-45de-a1f5-bd3ac8b6d4a2", "metadata": {}, "outputs": [], "source": [ "len(df_mother)" ] }, { "cell_type": "code", "execution_count": null, "id": "3b196716-ee0d-479b-922c-9bac711dd535", "metadata": {}, "outputs": [], "source": [ "test = wordnet.synsets(\"mother\",wordnet.NOUN)" ] }, { "cell_type": "code", "execution_count": null, "id": "bebb801e-6e43-463f-8a3d-7a09b709836e", "metadata": {}, "outputs": [], "source": [ "test" ] }, { "cell_type": "code", "execution_count": null, "id": "b9a87480-1e25-4614-95fc-5aa201efb9c3", "metadata": {}, "outputs": [], "source": [ "test1 = wordnet.synsets('father',wordnet.NOUN)\n", "testx = wordnet.synset(\"mother.n.01\")\n", "for syn in test1:\n", " definition = clean_definition(syn)\n", " test_def = clean_definition(testx)\n", " print(test_def)\n", " print(syn, definition, check_sim(process_text(test_def.text),process_text(definition.text)))" ] }, { "cell_type": "code", "execution_count": null, "id": "1757d434-5f67-465a-9559-34ce2eacf1f1", "metadata": {}, "outputs": [], "source": [ "test = \"colonizer.n.01\"" ] }, { "cell_type": "code", "execution_count": null, "id": "ea8eb48a-49dd-4aae-b5ae-ae0db2bddc49", "metadata": {}, "outputs": [], "source": [ "test2 = \"mother.n.01\"" ] }, { "cell_type": "code", "execution_count": null, "id": "7b99656d-efab-4c6e-8dca-0d604cbd5bbe", "metadata": {}, "outputs": [], "source": [ "mother = nlp(wordnet.synset(\"black.n.05\").definition())\n", "print(mother)\n", "colony = nlp(wordnet.synset(\"white.n.01\").definition())\n", "print(colony)\n", "print(mother.similarity(colony))" ] }, { "cell_type": "code", "execution_count": null, "id": "b735efc4-0c84-4632-b850-7395f27971fe", "metadata": {}, "outputs": [], "source": [ "mother_processed = nlp(process_text(mother.text))\n", "colony_processed = nlp(process_text(colony.text))" ] }, { "cell_type": "code", "execution_count": null, "id": "c5972e22-ae21-4b6a-9c1a-cbd907c8aef1", "metadata": {}, "outputs": [], "source": [ "print(mother_processed.similarity(colony_processed))" ] }, { "cell_type": "code", "execution_count": null, "id": "1046cc13-ea10-4bb1-bd59-daa1507c5c19", "metadata": {}, "outputs": [], "source": [ "a = clean_definition(test)\n", "\n", "b = clean_definition(test2)\n", "\n", "a.similarity(b)" ] }, { "cell_type": "code", "execution_count": null, "id": "d9f46e35-2bc9-4937-bb5f-ce6d268150af", "metadata": {}, "outputs": [], "source": [ "a_p = nlp(process_text(a.text))\n", "b_p = nlp(process_text(b.text))\n", "a_p.similarity(b_p)" ] }, { "cell_type": "code", "execution_count": null, "id": "451815b5-3fa8-48c4-a89a-7b70d19d00da", "metadata": {}, "outputs": [], "source": [ "check_sim(a,b)" ] }, { "cell_type": "code", "execution_count": null, "id": "98519912-fe93-4b9a-85cb-9d7001735627", "metadata": {}, "outputs": [], "source": [ "test3 = wordnet.synset(\"white_supremacist.n.01\")\n", "c = clean_definition(test3)\n", "a.similarity(c)" ] }, { "cell_type": "code", "execution_count": null, "id": "2d44f7ee-f7a7-421e-9cb5-d0f599c2b9ab", "metadata": {}, "outputs": [], "source": [ "def get_parallel(word, seed_definition, QA=False):\n", " cleaned = 
nlp(process_text(seed_definition))\n", "    root_syns = wordnet.synsets(word)\n", "    hypers = []\n", "    new_hypos = []\n", "    \n", "    #Collect the hypernyms of every sense of the word, then gather their hyponyms as candidates.\n", "    for syn in root_syns:\n", "        hypers.extend(syn.hypernyms())\n", "    \n", "    #hypers = list(set([syn for syn in hypers if cleaned.similarity(nlp(process_text(syn.definition()))) >=.5]))\n", "    \n", "    for syn in hypers:\n", "        new_hypos.extend(syn.hyponyms())\n", "    \n", "    hypos = list(set([syn for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.75]))\n", "    print(len(hypos)) #for QA purposes\n", "    #Tier the similarity threshold by how many candidates survived the first pass; order the branches so each one is reachable.\n", "    if len(hypos) < 3:\n", "        hypos = list(set([(syn, cleaned.similarity(nlp(process_text(syn.definition())))) for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.5]))\n", "    elif len(hypos) < 10:\n", "        hypos = list(set([(syn, cleaned.similarity(nlp(process_text(syn.definition())))) for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.66]))\n", "    elif len(hypos) < 20:\n", "        hypos = list(set([(syn, cleaned.similarity(nlp(process_text(syn.definition())))) for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.8]))\n", "    else:\n", "        hypos = list(set([(syn, cleaned.similarity(nlp(process_text(syn.definition())))) for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.9]))\n", "    if QA:\n", "        print(hypers)\n", "        print(hypos)\n", "        return hypers, hypos\n", "    else:\n", "        return hypos\n", "\n", "# Builds a dataframe dynamically from WordNet using NLTK.\n", "def wordnet_parallel_df(word,POS=False,seed_definition=None):\n", "    pos_options = ['NOUN','VERB','ADJ','ADV']\n", "    synonyms, antonyms = syn_ant(word,POS,False)\n", "    #print(synonyms, antonyms) #for QA purposes\n", "    words = []\n", "    cats = []\n", "    #WordNet hates spaces so you have to remove them\n", "    m_word = word.replace(\" \", \"_\")\n", "    \n", "    #Allow the user to pick a seed definition if it is not provided directly to the function.\n", "    if seed_definition is None:\n", "        if POS in pos_options:\n", "            seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS))]\n", "        else:\n", "            seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word)]\n", "        for d in range(len(seed_definitions)):\n", "            print(f\"{d}: {seed_definitions[d]}\")\n", "        choice = int(input(\"Which of the definitions above most aligns to your selection?\"))\n", "        seed_definition = seed_definitions[choice]\n", "    \n", "    hypos = get_parallel(m_word,seed_definition)\n", "    for syn,sim in hypos:\n", "        cur_lemmas = syn.lemmas()\n", "        cur_hypos = syn.hyponyms()\n", "        for hypo in cur_hypos:\n", "            cur_lemmas.extend(hypo.lemmas())\n", "        for lemma in cur_lemmas:\n", "            ll = lemma.name()\n", "            cats.append(re.sub(\"_\",\" \", syn.name().split(\".\")[0]))\n", "            words.append(re.sub(\"_\",\" \",ll))\n", "\n", "    df = {\"Categories\":cats, \"Words\":words}\n", "    df = pd.DataFrame(df)\n", "    df = df.drop_duplicates().reset_index(drop=True)\n", "    return df" ] }, { "cell_type": "code", "execution_count": null, "id": "dbd95998-ec11-4166-93fa-18c0a99c4d6e", "metadata": {}, "outputs": [], "source": [ "gay_root = wordnet.synsets(\"gay\")" ] }, { "cell_type": "code", "execution_count": null, "id": "3cdc4a08-2e90-4ab9-ae5e-6c95fd162048", "metadata": {}, "outputs": [], "source": [ "gay = wordnet.synset(\"gay.s.06\").definition()\n", "print(gay)\n", "hypers, hypos1 = get_parallel(\"gay\",gay,True)" ] }, { "cell_type": "code", "execution_count": null, "id": 
"34b80b88-3089-44b5-8c5b-13e5f7ea8446", "metadata": {}, "outputs": [], "source": [ "len(hypos1)" ] }, { "cell_type": "code", "execution_count": null, "id": "a134ba49-19cc-4937-b6af-67e044e3bcd2", "metadata": {}, "outputs": [], "source": [ "for root in gay_root:\n", " print(root, root.definition())" ] }, { "cell_type": "code", "execution_count": null, "id": "662ff6a8-b5af-4c6a-8102-39b66b85e5d1", "metadata": {}, "outputs": [], "source": [ "wordnet.synsets(\"chinese\")" ] }, { "cell_type": "code", "execution_count": null, "id": "4bc77b81-8c43-4cbb-bc7e-a178e76d3659", "metadata": {}, "outputs": [], "source": [ "chinese = wordnet.synset(\"chinese.a.01\").definition()\n", "hypers, hypos = get_parallel(\"chinese\",chinese,True)" ] }, { "cell_type": "code", "execution_count": null, "id": "8b66bb7a-0ede-48a1-888a-c90e81e2d75d", "metadata": {}, "outputs": [], "source": [ "lemmas = []\n", "for hypo in hypos1:\n", " lemmas.extend([re.sub(\"_\",\" \",lemma.name()) for lemma in hypo[0].lemmas()])\n", "lemmas" ] }, { "cell_type": "code", "execution_count": null, "id": "221c43f2-05f1-4a48-95a8-eb6a122527e9", "metadata": {}, "outputs": [], "source": [ "len(lemmas)" ] }, { "cell_type": "code", "execution_count": null, "id": "3d75b92b-be76-45c5-b955-d1f64ec03bd4", "metadata": {}, "outputs": [], "source": [ "df = wordnet_parallel_df(\"gay\",seed_definition=gay)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "35194a7a-a814-43c6-a57c-c40e54b81847", "metadata": {}, "outputs": [], "source": [ "len(df)" ] }, { "cell_type": "code", "execution_count": null, "id": "29618210-fec7-40b6-b326-107e8570abca", "metadata": {}, "outputs": [], "source": [ "df_grouped = df.groupby('Categories').count()" ] }, { "cell_type": "code", "execution_count": null, "id": "407cda3a-1d7a-4863-aa1e-e69860e6cfb5", "metadata": {}, "outputs": [], "source": [ "df_grouped.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "3b70c510-997a-4675-963c-ca7000e79eb4", "metadata": {}, "outputs": [], "source": [ "tiny = wordnet.synsets(\"tiny\", wordnet.ADJ)" ] }, { "cell_type": "code", "execution_count": null, "id": "2fe63d4d-b080-49ae-a1b6-487e8b440e76", "metadata": {}, "outputs": [], "source": [ "tiny" ] }, { "cell_type": "code", "execution_count": null, "id": "9661c299-369b-4538-86d9-003b3dc9fa5c", "metadata": {}, "outputs": [], "source": [ "tiny[0].lemmas()" ] }, { "cell_type": "code", "execution_count": null, "id": "99a6e4d9-2923-41a9-94b3-09d21c699f21", "metadata": {}, "outputs": [], "source": [ "new_alt = []\n", "for lemma in tiny[0].lemmas():\n", " new_alt.extend(wordnet.synsets(lemma.name()))\n", "new_alt" ] }, { "cell_type": "code", "execution_count": null, "id": "7ac3e75d-8a0e-44e2-910a-dcfcea86fa9f", "metadata": {}, "outputs": [], "source": [ "new_alt2 = list(set(new_alt))" ] }, { "cell_type": "code", "execution_count": null, "id": "3617a495-c722-466f-a74b-1e22bf025248", "metadata": {}, "outputs": [], "source": [ "for alt in new_alt2:\n", " print(alt,alt.hypernyms())" ] }, { "cell_type": "code", "execution_count": null, "id": "cc2f7839-f219-4cf8-ab5d-82781016e6c5", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }