Spaces:
Runtime error
Runtime error
ValadisCERTH
committed on
Commit
•
b2a1bba
1
Parent(s):
fe70a3e
Update magnitudeIdentification.py
Browse files- magnitudeIdentification.py +296 -187
magnitudeIdentification.py
CHANGED
@@ -1,211 +1,320 @@
|
|
1 |
import spacy
|
2 |
import re
|
3 |
-
|
4 |
-
from datetime import datetime
|
5 |
|
6 |
# Load the spacy model with GloVe embeddings
|
7 |
nlp = spacy.load("en_core_web_lg")
|
8 |
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
#
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
try:
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
try:
|
160 |
-
|
161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
else:
|
163 |
-
dt = datetime.strptime(date_str, '%d/%m/%y')
|
164 |
-
output_list.append(f'day:{dt.day}, month:{dt.month}, year:{dt.year}')
|
165 |
-
except ValueError:
|
166 |
-
output_list.append(f'INVALID FORMAT: {date_str}')
|
167 |
|
168 |
-
|
169 |
-
|
|
|
170 |
|
|
|
|
|
171 |
|
172 |
-
|
173 |
-
|
174 |
-
This is a function that binds together all the subcomponents of the dates identification, while also controlling for multiple, or zero date references
|
175 |
-
'''
|
176 |
|
177 |
-
|
|
|
|
|
|
|
|
|
178 |
|
179 |
-
|
180 |
-
|
|
|
181 |
|
182 |
-
|
183 |
-
|
184 |
-
|
|
|
|
|
185 |
|
186 |
-
|
187 |
-
|
188 |
|
189 |
-
|
|
|
190 |
|
191 |
-
# in case there is a wrong date format then return the appropriate code to prompt back the proper message
|
192 |
-
if 'INVALID FORMAT' in formatted_dates[0]:
|
193 |
-
return (0,'DATES','wrong_date_format')
|
194 |
|
195 |
-
|
196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
|
198 |
-
|
199 |
-
|
200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
|
202 |
-
|
203 |
-
|
204 |
-
return (0,'DATES','more_dates')
|
205 |
|
206 |
-
# in case of unexpected error return the appropriate code (to aid returning the correct prompt)
|
207 |
-
else:
|
208 |
-
return (0,'DATES','unknown_error')
|
209 |
|
210 |
-
except:
|
211 |
-
return (0,'DATES','unknown_error')
|
|
|
1 |
import spacy
|
2 |
import re
|
3 |
+
from word2number import w2n
|
|
|
4 |
|
5 |
# Load the spacy model with GloVe embeddings
|
6 |
nlp = spacy.load("en_core_web_lg")
|
7 |
|
8 |
|
9 |
+
def capture_numbers(input_sentence):
    '''
    Capture the numbers referred to in a sentence, in numeric or free-text form.

    Three passes are run. Whatever a pass captures is blanked out of the
    sentence before the next pass, so that e.g. "seven point five" is not
    reported a second time as "seven" and "five":

    1. "<x> point <y>" phrases (the separator may also be "decimal", "dot" or
       "comma"). These are returned with a trailing " $pattern" marker so a
       subsequent step can recognise them.
    2. Multi-word free-text numbers such as "thirty eight" or
       "one million and two".
    3. Every remaining token that spaCy flags as number-like (ints, floats,
       and single words like "eight" or "hundred").

    Returns the captured strings as one list (pass 1 + pass 2 + pass 3, each
    de-duplicated in first-seen order), or 0 if anything goes wrong.
    '''

    try:
        # This part is to capture cases like six point five, 5 point five, six point 5, 5 point 5
        pattern1 = r"(\d+|\w+(?:\s+\w+)*)\s+(decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"

        # add the $pattern string to easily specify them in a subsequent step;
        # dict.fromkeys removes duplicates while keeping a deterministic order
        pattern_final_numbers = list(dict.fromkeys(
            "{} {} {} {}".format(left, separator, right, '$pattern')
            for left, separator, right in re.findall(pattern1, input_sentence)
        ))

        # we delete the captured references from the sentence, because if we capture something like seven point five
        # then spacy will also identify seven and five, which we do not want it to
        for element in pattern_final_numbers:
            target_elem = element.replace("$pattern", "").strip()
            input_sentence = input_sentence.replace(target_elem, " ")

        # This is for cases of thirty eight or one million and two, etc.
        pattern2 = r"(?<!\w)(?:(?:zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion)(?:\s(?:and\s)?(?:zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion))+\s?)+(?!\w*pennies)"

        # The pattern may swallow one trailing whitespace character, so the same
        # number can come back as "thirty eight " and "thirty eight". Strip
        # before de-duplicating, otherwise one number is counted twice.
        multinumber_final_numbers = list(dict.fromkeys(
            found.strip() for found in re.findall(pattern2, input_sentence)
        ))

        # we also delete the captured references from the sentence in this case
        for element in multinumber_final_numbers:
            input_sentence = input_sentence.replace(element, " ")

        # Parse what is left of the sentence with spaCy
        doc = nlp(input_sentence)

        # This is to capture all the numbers in int and float form, as well as numbers like eight, two, hundred
        spacy_final_numbers = list(dict.fromkeys(
            token.text for token in doc if token.like_num
        ))

        # return the extracted numbers
        return pattern_final_numbers + multinumber_final_numbers + spacy_final_numbers

    except Exception:
        # best effort: callers treat a non-list result as a failure
        return 0
|
86 |
+
|
87 |
+
|
88 |
+
# Single number words recognised directly, without going through w2n
_NUMBER_WORDS = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
    'hundred': 100,
    'thousand': 1000,
    'million': 1000000,
    'billion': 1000000000,
    'trillion': 1000000000000,
}


def _resolve_magnitude_part(part, cast):
    '''
    Turn one side of an "<x> point <y>" phrase into a number.

    Tried in order: a single known number word, a direct ``cast(part)``
    (``float`` for the whole part, ``int`` for the fractional part), the w2n
    library, and finally a lone run of digits embedded in the text.

    Returns a ``(value, digits, error)`` triple:
      * value  - the number found (None on error)
      * digits - the literal digit string the value was read from, or None
                 when it did not come from plain digits; needed so that
                 leading zeros ("05") are not lost
      * error  - None on success, otherwise a (0, 'MAGNITUDE', <code>) tuple
    '''
    if part in _NUMBER_WORDS:
        return _NUMBER_WORDS[part], None, None

    # try a direct conversion. That means this is a number, otherwise it is a string so continue
    try:
        value = cast(part)
    except ValueError:
        pass
    else:
        return value, (part if part.isdecimal() else None), None

    # this will handle cases like "bla bla bla seven"
    try:
        return w2n.word_to_num(part), None, None
    except Exception:
        pass

    # this is to handle cases like "bla bla bla 7"
    try:
        digit_runs = re.findall(r'\d+', part)

        # if there is exactly one number then we cope with that
        if len(digit_runs) == 1:
            return int(digit_runs[0]), digit_runs[0], None
    except Exception:
        return None, None, (0, 'MAGNITUDE', 'unknown_error')

    # in any other case report an error
    if len(digit_runs) > 1:
        return None, None, (0, 'MAGNITUDE', 'more_magnitude')
    return None, None, (0, 'MAGNITUDE', 'no_magnitude')


def numeric_number_dot_freetext(text):
    '''
    Convert cases of '6 point five, six point 5 etc' into a float.

    The text is searched for "<whole> <separator> <fraction>", where the
    separator is one of "decimal", "point", "dot" or "comma" and each side is
    either digits or free text.

    Returns:
      * a float on success (e.g. "6 point 05" -> 6.05)
      * a (0, 'MAGNITUDE', <code>) tuple when one side cannot be resolved to
        exactly one number
      * 0 when the text does not match the pattern or an unexpected error occurs
    '''

    try:
        # Pattern to extract the whole part and the fractional part from the input text
        pattern = r"(\d+|\w+(?:\s+\w+)*)\s+(?:decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"

        match = re.search(pattern, text)

        if not match:
            # If input text doesn't match the expected pattern, return 0
            return 0

        num1, _, error = _resolve_magnitude_part(match.group(1), float)
        if error is not None:
            return error

        num2, num2_digits, error = _resolve_magnitude_part(match.group(2), int)
        if error is not None:
            return error

        # Number of decimal places: when the fraction was written in digits use
        # the text as written, so that "05" means five hundredths and not five
        # tenths; otherwise fall back to the width of the converted value.
        if num2_digits is not None:
            places = len(num2_digits)
        else:
            places = len(str(num2))

        try:
            # Convert both parts to float and add them together to get the final decimal value
            return float(num1) + float(num2) / (10 ** places)
        except Exception:
            return (0, 'MAGNITUDE', 'unknown_error')

    except Exception:
        return 0
|
221 |
+
|
222 |
+
|
223 |
+
def convert_into_numeric(num_list):
    '''
    Convert the identified numbers into a numeric form.

    num_list is expected to hold exactly one captured string: plain digits
    ("6.5"), a "$pattern"-tagged phrase ("six point five $pattern"), or a
    free-text number ("twenty three").

    Returns {'Number': <value>} on success, otherwise a
    (0, 'MAGNITUDE', <code>) tuple where <code> is one of 'no_magnitude',
    'more_magnitude', 'format_error' or 'unknown_error'.
    '''

    if not num_list:
        return (0, 'MAGNITUDE', 'no_magnitude')

    # at first we examine how many numbers were captured. Only one number should exist
    if len(num_list) > 1:
        return (0, 'MAGNITUDE', 'more_magnitude')

    target_num = num_list[0]

    # case it is an integer or float, convert it, otherwise move to following cases
    try:
        return {'Number': float(target_num)}
    except (TypeError, ValueError):
        pass

    # at first we check for cases like 6,5. If such cases exist we return a format error, otherwise we continue as before
    if ',' in target_num:
        try:
            float(target_num.replace(",", "."))
        except ValueError:
            return (0, 'MAGNITUDE', 'unknown_error')
        return (0, 'MAGNITUDE', 'format_error')

    # case that it belongs to one of the patterns of freetext number followed by numeric form etc (all the combinations)
    if "$pattern" in target_num:
        num = target_num.partition("$")[0]

        # try with this function for all the rest of cases (6 point 5, 6 point five, six point 5)
        num_conversion = numeric_number_dot_freetext(num)

        # an error tuple must be passed on as is, not wrapped as a number
        if isinstance(num_conversion, tuple):
            return num_conversion

        # a float is always a real result, even 0.0 ("zero point zero")
        if isinstance(num_conversion, float) or num_conversion:
            return {'Number': num_conversion}

        # the helper's 0 sentinel: the phrase could not be converted
        return (0, 'MAGNITUDE', 'unknown_error')

    # if none of the above has worked, then examine the case of freetext numbers without patterns (e.g. two, million, twenty three, etc)
    try:
        return {'Number': w2n.word_to_num(target_num)}
    except Exception:
        pass

    # if none of the above try to handle cases of "million and two" or "a million and two". In such cases, we delete any 'a' reference
    # and we insert the word 'one' at the beginning. In that way the w2n library can handle them instead of immediately throwing an error
    try:
        new_target_num = "one " + target_num.replace(" a ", " ")
        return {'Number': w2n.word_to_num(new_target_num)}
    except Exception:
        return (0, 'MAGNITUDE', 'unknown_error')
|
287 |
|
|
|
|
|
|
|
288 |
|
289 |
+
def magnitude_binding(input_text):
    '''
    Bind together all the subcomponents of the magnitude number identification,
    while also controlling for multiple, or zero magnitude references.

    Returns whatever convert_into_numeric returns when exactly one magnitude
    was captured, otherwise a (0, 'MAGNITUDE', <code>) tuple with <code> being
    'no_magnitude', 'more_magnitude' or 'unknown_error' (to aid returning the
    correct prompt).
    '''

    try:
        # capture the referred magnitudes
        target_numbers = capture_numbers(input_text)

        # capture_numbers returns 0 instead of a list when it fails
        if not isinstance(target_numbers, list):
            return (0, 'MAGNITUDE', 'unknown_error')

        # in case of zero references return the appropriate code
        if not target_numbers:
            return (0, 'MAGNITUDE', 'no_magnitude')

        # in case of more than one references return the appropriate code
        if len(target_numbers) > 1:
            return (0, 'MAGNITUDE', 'more_magnitude')

        # we only accept one magnitude reference
        return convert_into_numeric(target_numbers)

    except Exception:
        # in case of unexpected error return the appropriate code
        return (0, 'MAGNITUDE', 'unknown_error')
|
|
|
319 |
|
|
|
|
|
|
|
320 |
|
|
|
|