import re
import string

import inflect
import wikipedia as wiki
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# cleans the last sentence of a Quiz Bowl question by removing the
# "For 10 points," / "FTP," marker
def clean_last_sent(text):
    cleaned = text.replace('FTP,', 'For 10 points,')
    cleaned = re.sub('(?i)for 10 points,', '', cleaned)
    return cleaned
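# e.g. (hypothetical question text):
#   clean_last_sent('FTP, name this author of "Moby-Dick".')
#   -> ' name this author of "Moby-Dick".'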
# gives the question a more "question-like" ending ("... name this X.") if the
# question hasn't reached its last sentence yet
def add_proper_tail(text):
    if 'name this' in text.lower():
        return text
    words = [word.lower() for word in text.split()]
    # 'this' must appear as a standalone word (not as part of e.g. 'thistle')
    if 'this' not in words:
        return text
    idx_of_this = words.index('this')
    # guard against 'this' being the final word of the text
    if idx_of_this + 1 >= len(words):
        return text
    beginning = ' name this ' if text[-1] == '.' else '. name this '
    # strip punctuation stuck to the word following 'this'
    noun = words[idx_of_this + 1].strip(string.punctuation)
    return text + beginning + noun + '.'
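# e.g. (hypothetical text):
#   add_proper_tail('This composer wrote nine symphonies')
#   -> 'This composer wrote nine symphonies. name this composer.'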
# returns the words in the text, excluding stop words and punctuation
def get_filtered_words(text):
    cleaned = clean_last_sent(text)
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(cleaned)
    filtered = [word for word in words if word.lower() not in stop_words]
    # drop punctuation tokens
    filtered = [word for word in filtered if word not in string.punctuation]
    return filtered
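# e.g.: get_filtered_words('For 10 points, name this planet.')
# -> ['name', 'planet']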
# builds a valid Wikipedia search query from the proper nouns in the text,
# taken in reverse order
def get_nnp_query(text):
    # take out words in quotes
    text = re.sub('"(.*?)"', '', text)
    # find all proper nouns (NNP / NNPS tags)
    tagged_sent = pos_tag(word_tokenize(text))
    proper_nouns = [word for word, pos in tagged_sent if 'NNP' in pos]
    proper_nouns.reverse()
    query = ''
    for nnp in proper_nouns:
        test_query = query + nnp
        results = wiki.search(test_query)
        # skip any proper noun that makes the query return no results
        if len(results) == 0:
            continue
        query += nnp + ' '
    return query.strip()
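# usage sketch (the exact query depends on live Wikipedia search results):
#   get_nnp_query('He debated Stephen Douglas before his presidency.')
#   might return something like 'Douglas Stephen'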
# builds a valid Wikipedia search query from the nouns in the text,
# taken in reverse order
def get_nn_query(text):
    # take out words in quotes
    text = re.sub('"(.*?)"', '', text)
    # find all types of nouns (NN, NNS, NNP, NNPS tags)
    tagged_sent = pos_tag(word_tokenize(text))
    nouns = [word for word, pos in tagged_sent if 'NN' in pos]
    nouns.reverse()
    query = ''
    for nn in nouns:
        test_query = query + nn
        results = wiki.search(test_query)
        # skip any noun that makes the query return no results
        if len(results) == 0:
            continue
        query += nn + ' '
    return query.strip()
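# used the same way as get_nnp_query, but considers every noun type, which
# presumably helps when a question contains few or no proper nouns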
# checks whether either text's word set is a subset of the other's
def is_either_text_subset(text1, text2):
    # tokenize, lowercase, and collect the unique words of each text
    text1_set = {word.lower() for word in word_tokenize(text1)}
    text2_set = {word.lower() for word in word_tokenize(text2)}
    return text1_set.issubset(text2_set) or text2_set.issubset(text1_set)
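# e.g.: is_either_text_subset('Moby Dick', 'the novel Moby Dick') -> True
#       is_either_text_subset('Moby Dick', 'the novel Ulysses') -> False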
# checks whether the text (or its singular form) matches the title of the
# top Wikipedia search result, i.e. whether it has its own page
def has_wiki_page(text):
    results = wiki.search(text)
    if not results:
        return False
    title = results[0]
    p = inflect.engine()
    singular = p.singular_noun(text)
    # singular_noun() returns False when the text is already singular
    singular = singular if singular else text
    return text.lower() == title.lower() or singular.lower() == title.lower()
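# e.g. (results depend on the live Wikipedia index):
#   has_wiki_page('black holes') is likely True, since the top search result
#   is the page titled 'Black hole' and singular_noun('black holes') gives
#   'black hole'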
# uses the helpers above to filter the answer set
def filter_answers(set_, text):
    # keep only answers longer than two characters, to avoid the stray
    # letters that sometimes appear
    set_ = {tup for tup in set_ if len(tup[0]) > 2}
    # drop answers whose words already appear in the question (or vice versa)
    set_ = {tup for tup in set_ if not is_either_text_subset(tup[0], text)}
    # keep only answers that have their own Wikipedia pages
    set_ = {tup for tup in set_ if has_wiki_page(tup[0])}
    return set_
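# usage sketch, assuming the answer set holds (answer, score) tuples; the
# code above only relies on tup[0] being the answer string:
#   answers = {('Herman Melville', 0.9), ('he', 0.4)}
#   filter_answers(answers, question_text)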
# gets the text of the top result for the given query in a Wikipedia search
def get_wiki_text(query):
    results = wiki.search(query)
    # return an empty string if the search comes back empty
    if not results:
        return ''
    top_page = wiki.page(results[0], auto_suggest=False)
    return top_page.content
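# e.g. (network call): get_wiki_text('Moby-Dick') returns the full article
# text of the top search result for that query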
# splits the text into "chunks" of the requested size
def get_text_chunks(text, size):
    splits = []
    for i in range(0, len(text), size):
        splits.append(text[i: i + size])
    return splits
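# e.g.: get_text_chunks('abcdefgh', 3) -> ['abc', 'def', 'gh']

# Minimal end-to-end sketch, assuming an internet connection and that the
# required NLTK data (punkt, stopwords, averaged_perceptron_tagger) has been
# downloaded; the question text is made up for illustration.
if __name__ == '__main__':
    question = ('He debated Stephen Douglas in 1858 and later issued the '
                'Emancipation Proclamation. For 10 points, name this president.')
    question = add_proper_tail(clean_last_sent(question))
    query = get_nnp_query(question)
    print('query:', query)
    if query:
        # look at the retrieved article in 1000-character chunks
        chunks = get_text_chunks(get_wiki_text(query), 1000)
        print('retrieved', len(chunks), 'chunks; first chunk:')
        print(chunks[0])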