yu3ufff committed on
Commit
ffc3577
1 Parent(s): 76569e2

Upload utils.py

Files changed (1): utils.py +149 -0
utils.py ADDED
@@ -0,0 +1,149 @@
+ import re
+ import string
+ 
+ import inflect
+ import wikipedia as wiki
+ from nltk.corpus import stopwords
+ from nltk.tag import pos_tag
+ from nltk.tokenize import word_tokenize
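+ 
+ # NOTE (assumption): the NLTK resources used below ('stopwords' for stopwords,
+ # 'punkt' for word_tokenize, and 'averaged_perceptron_tagger' for pos_tag)
+ # are assumed to have been downloaded once beforehand, e.g. via
+ # nltk.download('punkt').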
+ 
+ 
+ # cleans the last sentence of a Quiz Bowl question by stripping the
+ # "For 10 points," prompt and its "FTP," abbreviation
+ def clean_last_sent(text):
+     cleaned = text.replace('FTP,', 'For 10 points,')
+     cleaned = re.sub('(?i)for 10 points,', '', cleaned)
+ 
+     return cleaned
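+ # e.g. clean_last_sent('FTP, name this poet.') returns ' name this poet.'
+ # (the prompt is removed; surrounding whitespace is left as-is)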
+ 
+ 
+ # gives the question a more "question-like" ending ("... name this X.") if it
+ # hasn't reached its final sentence yet
+ def add_proper_tail(text):
+     if 'name this' in text.lower():
+         return text
+ 
+     words = [word.lower() for word in text.split()]
+     # bail out if 'this' never appears as a standalone word or is the last
+     # word, since there is then no following word to build the tail from
+     if 'this' not in words or words.index('this') + 1 == len(words):
+         return text
+ 
+     beginning = ' name this ' if text[-1] == '.' else '. name this '
+     tail = beginning + words[words.index('this') + 1] + '.'
+ 
+     return text + tail
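+ # e.g. add_proper_tail('Many maps chart this river') returns
+ # 'Many maps chart this river. name this river.'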
+ 
+ 
+ # returns the words in the text, excluding stop words and punctuation
+ def get_filtered_words(text):
+     cleaned = clean_last_sent(text)
+     stop_words = set(stopwords.words('english'))
+     words = word_tokenize(cleaned)
+     filtered = [word for word in words if word.lower() not in stop_words]
+     filtered = [word for word in filtered if word not in string.punctuation]
+ 
+     return filtered
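+ # e.g. get_filtered_words('FTP, name this American poet.') returns
+ # ['name', 'American', 'poet']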
+ 
+ 
+ # builds a Wikipedia-searchable query from the text's proper nouns, taken in
+ # reverse order; a noun is only added if the extended query still returns
+ # search results
+ def get_nnp_query(text):
+     # take out words in quotes
+     text = re.sub('"(.*?)"', '', text)
+ 
+     # find all proper nouns (NNP/NNPS tags)
+     tagged_sent = pos_tag(word_tokenize(text))
+     proper_nouns = [word for word, pos in tagged_sent if 'NNP' in pos]
+     proper_nouns.reverse()
+ 
+     query = ''
+     for nnp in proper_nouns:
+         test_query = query + nnp
+         results = wiki.search(test_query)
+         if len(results) == 0:
+             continue
+         query += nnp + ' '
+ 
+     return query
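+ # For illustration only (actual output depends on the POS tagger and on live
+ # Wikipedia search): a clue mentioning 'Tokugawa' and 'Japan' could yield the
+ # query 'Japan Tokugawa '.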
+ 
+ 
+ # same strategy as get_nnp_query, but over all noun tags
+ def get_nn_query(text):
+     # take out words in quotes
+     text = re.sub('"(.*?)"', '', text)
+ 
+     # find all types of nouns ('NN' matches the NN, NNS, NNP, and NNPS tags)
+     tagged_sent = pos_tag(word_tokenize(text))
+     nouns = [word for word, pos in tagged_sent if 'NN' in pos]
+     nouns.reverse()
+ 
+     query = ''
+     for nn in nouns:
+         test_query = query + nn
+         results = wiki.search(test_query)
+         if len(results) == 0:
+             continue
+         query += nn + ' '
+ 
+     return query
+ 
+ 
+ # small named helper so it can be passed straight to map()
+ def lower(s):
+     return s.lower()
+ 
+ 
+ # checks whether either text's words are a subset of the other's
+ def is_either_text_subset(text1, text2):
+     # tokenize, lowercase, and collect each text's unique words in a set
+     text1_set = set(map(lower, word_tokenize(text1)))
+     text2_set = set(map(lower, word_tokenize(text2)))
+ 
+     return text1_set.issubset(text2_set) or text2_set.issubset(text1_set)
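+ # e.g. is_either_text_subset('the Moon', 'The moon is bright') returns True,
+ # since {'the', 'moon'} is a subset of {'the', 'moon', 'is', 'bright'}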
+ 
+ 
+ # checks whether the text (or its singular form) matches the title of the top
+ # Wikipedia search result, i.e. whether it has its own Wikipedia page
+ def has_wiki_page(text):
+     results = wiki.search(text)
+     if not results:
+         return False
+ 
+     title = results[0]
+     p = inflect.engine()
+     # singular_noun() returns False if the text is already singular
+     singular = p.singular_noun(text)
+     singular = singular if singular else text
+ 
+     return text.lower() == title.lower() or singular.lower() == title.lower()
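+ # For illustration only (depends on live Wikipedia search): has_wiki_page('apples')
+ # returns True when the top search result is titled 'Apple', since the singular
+ # form 'apple' matches that title case-insensitively.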
+ 
+ 
+ # uses the checks above to filter a set of candidate answers; each element is
+ # expected to be a tuple whose first item is the answer string
+ def filter_answers(set_, text):
+     # keep answers longer than two characters, to drop the stray letter(s)
+     # that sometimes appear
+     set_ = {tup for tup in set_ if len(tup[0]) > 2}
+     # drop answers whose words already all appear in the question (or vice versa)
+     set_ = {tup for tup in set_ if not is_either_text_subset(tup[0], text)}
+     # keep only answers that have their own Wikipedia pages
+     set_ = {tup for tup in set_ if has_wiki_page(tup[0])}
+     return set_
+ 
+ 
+ # gets the full article text of the top result of a Wikipedia search; assumes
+ # the query returns at least one result (results[0] raises IndexError otherwise)
+ def get_wiki_text(query):
+     results = wiki.search(query)
+     top_page = wiki.page(results[0], auto_suggest=False)
+     text = top_page.content
+ 
+     return text
+ 
+ 
+ # splits the text into "chunks" of the requested size (the last chunk may be
+ # shorter)
+ def get_text_chunks(text, size):
+     splits = []
+     for i in range(0, len(text), size):
+         splits.append(text[i: i + size])
+ 
+     return splits
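+ 
+ 
+ # Minimal usage sketch, for illustration only: the inputs are made-up examples,
+ # and the Wikipedia-backed helpers need network access plus the NLTK data
+ # noted at the top of the file.
+ if __name__ == '__main__':
+     question = 'Many maps chart this river. FTP, name this longest river in Egypt'
+     print(clean_last_sent(question))
+     print(add_proper_tail('Many maps chart this river'))
+     print(get_filtered_words(question))
+     print(get_text_chunks('abcdefghij', 4))  # ['abcd', 'efgh', 'ij']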