import re
import string

import inflect
import wikipedia as wiki
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize


# removes the "for 10 points" marker (and its "FTP" abbreviation) from the
# last sentence of a Quiz Bowl question
def clean_last_sent(text):
    cleaned = text.replace('FTP,', 'For 10 points,')
    cleaned = re.sub('(?i)for 10 points,', '', cleaned)

    return cleaned
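
# example (hypothetical question text, for illustration only):
#   clean_last_sent('He wrote Hamlet. FTP, name this playwright.')
#   -> 'He wrote Hamlet.  name this playwright.'
# the leftover whitespace is harmless, since later steps re-tokenize the text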


# gives the question a more "question-like" ending if the question hasn't
# reached the last sentence yet
def add_proper_tail(text):
    if 'name this' in text.lower():
        return text

    # strip punctuation so tokens like 'this,' still match; a plain substring
    # check would also match words such as 'thistle'
    words = [word.lower().strip(string.punctuation) for word in text.split()]
    if 'this' not in words or words.index('this') == len(words) - 1:
        return text

    idx_of_this = words.index('this')
    beginning = ' name this ' if text[-1] == '.' else '. name this '
    tail = beginning + words[idx_of_this + 1] + '.'

    return text + tail
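
# example (hypothetical partial question, for illustration only):
#   add_proper_tail('This composer wrote nine symphonies')
#   -> 'This composer wrote nine symphonies. name this composer.'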


# returns the words in the text excluding stop words and punctuation
def get_filtered_words(text):
    cleaned = clean_last_sent(text)
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(cleaned)
    filtered = [word for word in words if word.lower() not in stop_words]
    filtered = [word for word in filtered if word not in string.punctuation]

    return filtered
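
# example (hypothetical text, for illustration only):
#   get_filtered_words('He wrote it, for 10 points, in 1623.')
#   -> ['wrote', '1623']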


# builds a query by appending candidate words in reverse order, keeping a
# word only if the extended query still returns Wikipedia search results
def build_query(words):
    query = ''
    for word in reversed(words):
        results = wiki.search(query + word)
        if len(results) == 0:
            continue
        query += word + ' '

    return query


# gets valid query using proper nouns in reverse order
def get_nnp_query(text):
    # take out words in quotes
    text = re.sub('"(.*?)"', '', text)

    # find all proper nouns (NNP/NNPS tags)
    tagged_sent = pos_tag(word_tokenize(text))
    proper_nouns = [word for word, pos in tagged_sent if 'NNP' in pos]

    return build_query(proper_nouns)


# gets valid query using all types of nouns (any NN* tag) in reverse order
def get_nn_query(text):
    # take out words in quotes
    text = re.sub('"(.*?)"', '', text)

    # find all types of nouns
    tagged_sent = pos_tag(word_tokenize(text))
    nouns = [word for word, pos in tagged_sent if 'NN' in pos]

    return build_query(nouns)
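
# example (hypothetical clue, for illustration only): for the text
#   'He defeated Pompey at Pharsalus'
# the proper nouns in reverse order are ['Pharsalus', 'Pompey'], and each one
# is kept only while the combined query still returns Wikipedia results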


# checks if either of the texts is a subset of the other
def is_either_text_subset(text1, text2):
    # tokenize, lowercase, and collect the unique words of each text
    text1_set = {word.lower() for word in word_tokenize(text1)}
    text2_set = {word.lower() for word in word_tokenize(text2)}

    return text1_set.issubset(text2_set) or text2_set.issubset(text1_set)
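
# example (for illustration only):
#   is_either_text_subset('Julius Caesar', 'the life of Julius Caesar') -> True,
#   since {'julius', 'caesar'} is a subset of the second text's word set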


# checks whether the text (or its singular form) matches the title of its top
# Wikipedia search result, i.e. whether it has its own Wikipedia page
def has_wiki_page(text):
    results = wiki.search(text)
    if not results:
        return False

    title = results[0]
    p = inflect.engine()
    # inflect returns False when the input is already singular
    singular = p.singular_noun(text)
    singular = singular if singular else text

    return text.lower() == title.lower() or singular.lower() == title.lower()
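
# example (for illustration only): has_wiki_page('black holes') returns True
# if the top search result is the page titled 'Black hole', since the
# singular form of the text matches the title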


# uses the checks above to filter the candidate answer set (a set of tuples
# whose first element is the answer text)
def filter_answers(set_, text):
    # make sure the answers are longer than two characters, to avoid the
    # stray letter(s) which sometimes appear
    set_ = {tup for tup in set_ if len(tup[0]) > 2}
    # make sure the answers are not contained in the question
    set_ = {tup for tup in set_ if not is_either_text_subset(tup[0], text)}
    # make sure all the answers have their own Wikipedia pages
    set_ = {tup for tup in set_ if has_wiki_page(tup[0])}

    return set_
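
# example (hypothetical candidate tuples, for illustration only):
#   filter_answers({('Hamlet', 0.9), ('he', 0.4)}, question_text)
#   would drop ('he', 0.4) for length, and keep ('Hamlet', 0.9) only if
#   'Hamlet' is not part of the question and has its own Wikipedia page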


# gets the text of the first result of the given query in a Wikipedia search
def get_wiki_text(query):
    results = wiki.search(query)
    # guard against queries with no search results
    if not results:
        return ''

    top_page = wiki.page(results[0], auto_suggest=False)
    text = top_page.content

    return text


# splits the text into "chunks" of the requested size
def get_text_chunks(text, size):
    splits = []
    for i in range(0, len(text), size):
        splits.append(text[i: i + size])

    return splits
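

# a minimal usage sketch (hypothetical question text; assumes the NLTK data
# packages 'punkt', 'stopwords', and 'averaged_perceptron_tagger' have been
# downloaded, and that the query actually returns results)
if __name__ == '__main__':
    question = 'He defeated Pompey at Pharsalus. FTP, name this Roman general.'
    question = clean_last_sent(question)
    query = get_nnp_query(question)
    if query:
        wiki_text = get_wiki_text(query)
        # print the start of the first two 1,000-character chunks
        for chunk in get_text_chunks(wiki_text, 1000)[:2]:
            print(chunk[:80])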