import re
import string

import inflect
import wikipedia as wiki
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
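
# NOTE: word_tokenize, stopwords, and pos_tag rely on the NLTK data packages
# 'punkt', 'stopwords', and 'averaged_perceptron_tagger'; download them once
# with nltk.download(...) if they are missing.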

# cleans the last sentence of a Quiz Bowl question
def clean_last_sent(text):
    cleaned = text.replace('FTP,', 'For 10 points,')
    cleaned = re.sub('(?i)for 10 points,', '', cleaned)
    return cleaned

# gives the question a more "question-like" ending if it hasn't reached the last sentence yet
def add_proper_tail(text):
    words = [word.lower() for word in text.split()]
    # bail out if the tail is already present or 'this' never appears as its own word
    if 'name this' in text.lower() or 'this' not in words:
        return text
    if text[-1] == '.':
        beginning = ' name this '
    else:
        beginning = '. name this '
    idx_of_this = words.index('this')
    # 'this' must be followed by the noun it modifies
    if idx_of_this + 1 == len(words):
        return text
    tail = beginning + words[idx_of_this + 1] + '.'
    return text + tail

# returns the words in the text excluding stop words and punctuation
def get_filtered_words(text):
    cleaned = clean_last_sent(text)
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(cleaned)
    filtered = [word for word in words if word.lower() not in stop_words]
    filtered = [word for word in filtered if word not in string.punctuation]
    return filtered

# builds a valid query from the text's proper nouns, taken in reverse order
def get_nnp_query(text):
    # take out words in quotes
    text = re.sub('"(.*?)"', '', text)
    # find all proper nouns
    tagged_sent = pos_tag(word_tokenize(text))
    proper_nouns = [word for word, pos in tagged_sent if 'NNP' in pos]
    proper_nouns.reverse()
    query = ''
    for nnp in proper_nouns:
        # only keep words whose addition still returns Wikipedia search results
        test_query = query + nnp
        results = wiki.search(test_query)
        if not results:
            continue
        query += nnp + ' '
    return query

# builds a valid query from the text's nouns, taken in reverse order
def get_nn_query(text):
    # take out words in quotes
    text = re.sub('"(.*?)"', '', text)
    # find all types of nouns (the 'NN' prefix matches NN, NNS, NNP, and NNPS)
    tagged_sent = pos_tag(word_tokenize(text))
    nouns = [word for word, pos in tagged_sent if 'NN' in pos]
    nouns.reverse()
    query = ''
    for nn in nouns:
        test_query = query + nn
        results = wiki.search(test_query)
        if not results:
            continue
        query += nn + ' '
    return query
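
# NOTE: each query builder above issues one wiki.search call per candidate
# noun, so a long sentence can trigger several network requests; get_nn_query
# casts a wider net than get_nnp_query because its 'NN' prefix check also
# matches the NNS/NNP/NNPS tags.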

# helper so str.lower can be passed directly to map()
def lower(s):
    return s.lower()

# checks whether either text's word set is a subset of the other's
def is_either_text_subset(text1, text2):
    # tokenize, lowercase, and collect the unique words of each text
    text1_set = set(map(lower, word_tokenize(text1)))
    text2_set = set(map(lower, word_tokenize(text2)))
    return text1_set.issubset(text2_set) or text2_set.issubset(text1_set)

# checks whether the text has its own Wikipedia page
def has_wiki_page(text):
    results = wiki.search(text)
    if not results:
        return False
    title = results[0]
    p = inflect.engine()
    # singular_noun() returns False when the word is already singular
    singular = p.singular_noun(text)
    singular = singular if singular else text
    return text.lower() == title.lower() or singular.lower() == title.lower()

# uses the three functions above to filter the answer set
def filter_answers(set_, text):
    # each element is a tuple whose first item is the candidate answer string;
    # require more than two characters to avoid the stray letters that sometimes appear
    set_ = {tup for tup in set_ if len(tup[0]) > 2}
    # make sure the answers do not already appear in the question
    set_ = {tup for tup in set_ if not is_either_text_subset(tup[0], text)}
    # make sure all answers have their own Wikipedia pages
    set_ = {tup for tup in set_ if has_wiki_page(tup[0])}
    return set_

# gets the text of the first result of the given query in a Wikipedia search
def get_wiki_text(query):
    results = wiki.search(query)
    top_page = wiki.page(results[0], auto_suggest=False)
    return top_page.content

# splits the text into "chunks" of the requested size
def get_text_chunks(text, size):
    splits = []
    for i in range(0, len(text), size):
        splits.append(text[i: i + size])
    return splits
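
# Example usage (a minimal sketch of how these helpers might be chained; the
# sample question and the chunk size of 1000 are illustrative, and the
# Wikipedia calls require network access):
#
#   question = 'He wrote "Ozymandias". FTP, name this Romantic poet.'
#   last_sent = clean_last_sent(question)
#   query = get_nnp_query(last_sent) or get_nn_query(last_sent)
#   if query:
#       wiki_text = get_wiki_text(query)
#       chunks = get_text_chunks(wiki_text, 1000)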