import re
import string

import inflect
import wikipedia as wiki
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# cleans the last sentence of a Quiz Bowl question by stripping the "for 10 points" prompt
def clean_last_sent(text):
    cleaned = text.replace('FTP,', 'For 10 points,')
    cleaned = re.sub('(?i)for 10 points,', '', cleaned)
    return cleaned
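# worked example (assumed input, not from the original file):
#   clean_last_sent('FTP, name this Russian author of War and Peace.')
#   -> ' name this Russian author of War and Peace.'
# the 'FTP,' abbreviation is expanded first, so a single regex pass removes either form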
# gives the question a more "question-like" ending if the question hasn't reached its last sentence yet
def add_proper_tail(text):
    if 'name this' in text.lower():
        return text
    words = [word.lower() for word in text.split()]
    # bail out if "this" never appears as its own word, or appears only as the final word
    if 'this' not in words or words.index('this') + 1 >= len(words):
        return text
    # append a sentence asking to name whatever noun follows "this"
    beginning = ' name this ' if text[-1] == '.' else '. name this '
    idx_of_this = words.index('this')
    tail = beginning + words[idx_of_this + 1] + '.'
    return text + tail
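# worked example (assumed input): a question cut off mid-stream gains an explicit prompt:
#   add_proper_tail('This poet wrote The Raven')
#   -> 'This poet wrote The Raven. name this poet.'
# the noun right after "this" ("poet") becomes the thing the appended sentence asks for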
# returns the words in the text, excluding stop words and punctuation
def get_filtered_words(text):
    cleaned = clean_last_sent(text)
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(cleaned)
    filtered = [word for word in words if word.lower() not in stop_words]
    # drop bare punctuation tokens
    filtered = [word for word in filtered if word not in string.punctuation]
    return filtered
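# worked example (assuming NLTK's standard English stop word list is installed):
#   get_filtered_words('FTP, name this author of War and Peace.')
#   -> ['name', 'author', 'War', 'Peace']
# 'this', 'of', and 'and' fall out as stop words; the trailing '.' falls out as punctuation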
# builds a valid search query from the text's proper nouns, taken in reverse order
def get_nnp_query(text):
    # take out words in quotes
    text = re.sub('"(.*?)"', '', text)
    # find all proper nouns
    tagged_sent = pos_tag(word_tokenize(text))
    proper_nouns = [word for word, pos in tagged_sent if 'NNP' in pos]
    proper_nouns.reverse()
    query = ''
    for nnp in proper_nouns:
        test_query = query + nnp
        results = wiki.search(test_query)
        # skip any noun that makes the growing query return no search results
        if len(results) == 0:
            continue
        query += nnp + ' '
    return query
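# hypothetical run (real output depends on the tagger and on live Wikipedia search):
#   get_nnp_query('This poet wrote "The Raven" in Baltimore')
# the quoted title is removed first, so only 'Baltimore' is tagged NNP and tested;
# if its search succeeds, the returned query is 'Baltimore ' (with the trailing space)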
# builds a valid search query from the text's nouns (common and proper), taken in reverse order
def get_nn_query(text):
    # take out words in quotes
    text = re.sub('"(.*?)"', '', text)
    # find all types of nouns
    tagged_sent = pos_tag(word_tokenize(text))
    nouns = [word for word, pos in tagged_sent if 'NN' in pos]
    nouns.reverse()
    query = ''
    for nn in nouns:
        test_query = query + nn
        results = wiki.search(test_query)
        # skip any noun that makes the growing query return no search results
        if len(results) == 0:
            continue
        query += nn + ' '
    return query
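# get_nn_query is the looser fallback: the 'NN' substring also matches NNS/NNP/NNPS tags,
# so it admits every noun and can still build a query when a question names no proper nouns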
# checks if either text's word set is a subset of the other's
def is_either_text_subset(text1, text2):
    # tokenize, lowercase, and collect each text's unique words
    text1_set = {word.lower() for word in word_tokenize(text1)}
    text2_set = {word.lower() for word in word_tokenize(text2)}
    return text1_set.issubset(text2_set) or text2_set.issubset(text1_set)
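# worked examples (assumed inputs): every word of the first text appears in the second,
# so the check flags the pair; the second pair shares no containment either way:
#   is_either_text_subset('the raven', 'The Raven and other poems')   -> True
#   is_either_text_subset('annabel lee', 'The Raven and other poems') -> False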
# checks if the text matches the title of a Wikipedia page
def has_wiki_page(text):
    results = wiki.search(text)
    if not results:
        return False
    title = results[0]
    # also accept a match on the singular form of the text
    p = inflect.engine()
    singular = p.singular_noun(text)
    singular = singular if singular else text
    return text.lower() == title.lower() or singular.lower() == title.lower()
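# hypothetical run (depends on live search results): inflect singularizes 'Ravens'
# to 'Raven', so if Wikipedia's top hit for 'Ravens' is titled 'Raven', the check passes:
#   has_wiki_page('Ravens')  # -> True under that assumption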
# uses the three functions above to filter the answer set
def filter_answers(set_, text):
    # make sure each answer is more than two characters, to avoid the stray letters that sometimes appear
    set_ = {tup for tup in set_ if len(tup[0]) > 2}
    # make sure the answers are not word-subsets of the question (or vice versa)
    set_ = {tup for tup in set_ if not is_either_text_subset(tup[0], text)}
    # make sure all answers have their own Wikipedia pages
    set_ = {tup for tup in set_ if has_wiki_page(tup[0])}
    return set_
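# assumed shape of set_ (only tup[0] is inspected here): tuples whose first element is
# the candidate answer string; any extra elements, e.g. a model score, pass through intact:
#   filter_answers({('Edgar Allan Poe', 0.91), ('it', 0.40)}, question_text)
# would drop ('it', 0.40) at the length check, before any Wikipedia lookups run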
# gets the text of the first result of the given query in a Wikipedia search
def get_wiki_text(query):
    results = wiki.search(query)
    # guard against a query with no results
    if not results:
        return ''
    top_page = wiki.page(results[0], auto_suggest=False)
    return top_page.content
# splits the text into "chunks" of the requested size
def get_text_chunks(text, size):
    splits = []
    for i in range(0, len(text), size):
        splits.append(text[i: i + size])
    return splits
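# worked example (pure string slicing, no network needed):
#   get_text_chunks('abcdefgh', 3)  -> ['abc', 'def', 'gh']
#
# a minimal end-to-end sketch (assumed usage; the Space's driver code is not shown here,
# and the chunk size of 2500 is an arbitrary placeholder):
#   question = add_proper_tail('This poet wrote The Raven')
#   query = get_nnp_query(question) or get_nn_query(question)
#   chunks = get_text_chunks(get_wiki_text(query), 2500)
#   # each chunk can then be handed to a downstream reader / QA model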