xiomarablanco committed
Commit
155d7f2
1 Parent(s): 4714ef8

Changed 'es_core_news_sm' to '/path/to/es_core_news_sm'

Files changed (1)
  1. codeScripts/utils.py +339 -338
codeScripts/utils.py CHANGED
@@ -1,339 +1,340 @@
+import json
+import numpy as np
+import hunspell
+import nltk
+import nltk.corpus
+from nltk.tokenize import sent_tokenize
+from nltk.tokenize import word_tokenize
+from nltk import ne_chunk
+import re
+import yake
+import spacy
+#dic = hunspell.Hunspell('/Users/miguel.r/Desktop/UNIR/PLenTaS/CORPUS/dict_es_ES/es_ES', '/Users/miguel.r/Desktop/es_ES/es_ES.dic')
+
-nlp = spacy.load('es_core_news_sm') # spaCy Spanish-language package (es)
+nlp = spacy.load('/path/to/es_core_news_sm')
+#nlp = spacy.load('es_core_news_sm') # spaCy Spanish-language package (es)
+
+# Class created to count the syllables of a word (Source: https://github.com/amunozf/separasilabas/blob/master/separasilabas.py)
+
+#class char():
+    #def __init__(self):
+        #pass
+
+class char_line():
+    def __init__(self, word):
+        self.word = word
+        self.char_line = [(char, self.char_type(char)) for char in word]
+        self.type_line = ''.join(chartype for char, chartype in self.char_line)
+
+    def char_type(self, char):
+        if char in set(['a', 'á', 'e', 'é', 'o', 'ó', 'í', 'ú']):
+            return 'V' #strong vowel
+        if char in set(['i', 'u', 'ü']):
+            return 'v' #weak vowel
+        if char=='x':
+            return 'x'
+        if char=='s':
+            return 's'
+        else:
+            return 'c'
+
+    def find(self, finder):
+        return self.type_line.find(finder)
+
+    def split(self, pos, where):
+        return char_line(self.word[0:pos+where]), char_line(self.word[pos+where:])
+
+    def split_by(self, finder, where):
+        split_point = self.find(finder)
+        if split_point!=-1:
+            chl1, chl2 = self.split(split_point, where)
+            return chl1, chl2
+        return self, False
+
+    def __str__(self):
+        return self.word
+
+    def __repr__(self):
+        return repr(self.word)
+
+class silabizer():
+    def __init__(self):
+        self.grammar = []
+
+    def split(self, chars):
+        rules = [('VV',1), ('cccc',2), ('xcc',1), ('ccx',2), ('csc',2), ('xc',1), ('cc',1), ('vcc',2), ('Vcc',2), ('sc',1), ('cs',1), ('Vc',1), ('vc',1), ('Vs',1), ('vs',1)]
+        for split_rule, where in rules:
+            first, second = chars.split_by(split_rule, where)
+            if second:
+                if first.type_line in set(['c','s','x','cs']) or second.type_line in set(['c','s','x','cs']):
+                    #print 'skip1', first.word, second.word, split_rule, chars.type_line
+                    continue
+                if first.type_line[-1]=='c' and second.word[0] in set(['l','r']):
+                    continue
+                if first.word[-1]=='l' and second.word[-1]=='l':
+                    continue
+                if first.word[-1]=='r' and second.word[-1]=='r':
+                    continue
+                if first.word[-1]=='c' and second.word[-1]=='h':
+                    continue
+                return self.split(first)+self.split(second)
+        return [chars]
+
+    def __call__(self, word):
+        return self.split(char_line(word))
+
+# Counts the number of sentences and words used in the answer
+def check_senteces_words(student_answer):
+
+    # Tokenizing into sentences
+    sentences=[]
+    words=[]
+    letter_per_word=[]
+    syll=0 # syllables counter
+
+    TokenizeAnswer = sent_tokenize(student_answer)
+    for token in TokenizeAnswer:
+        regex = '\\.'
+        token = re.sub(regex, '', token)
+        sentences.append(token)
+    for i in range(len(sentences)):
+        word = sentences[i].split(' ')
+        for j in range(len(word)):
+            words.append(word[j])
+            syllables = silabizer()
+            syll=syll+len(syllables(word[j]))
+            letter_per_word.append(len(word[j]))
+
+    sentencesLenght = len(sentences)
+    wordsLenght = (len(words))
+    #print(f'Number of senteces used in the answer: {sentencesLenght}')
+    #print(f'Number of words used in the answer: {wordsLenght}')
+
+    return sentencesLenght, wordsLenght, syll, letter_per_word
+
+# Spelling-mistake counter
+def spelling_corrector(student_answer, hunspell_aff = '/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES', hunspell_dic = '/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES.dic'):
+
+    dic = hunspell.Hunspell(hunspell_aff, hunspell_dic)
+    errors=0
+    words = student_answer.split(' ')
+    wrong_words = []
+    for word in words:
+        for element in clean_words(word):
+            if not dic.spell(element):
+                #print(f'Spelling mistake: {element}')
+                wrong_words.append(element)
+                errors+=1
+    #print(f'Spelling mistakes: {errors}')
+    return errors, wrong_words
+
+# Readability of the answer according to the Fernández-Huerta index
+def FHuertas_index(sentencesLenght, wordsLenght, syll):
+    FH = 206.84 - 0.60*(syll*100/wordsLenght) - 1.02*(sentencesLenght*100/wordsLenght)
+    FH = round(FH, 3)
+    legibilidad_fh = ""
+    #print(f'\nFernández-Huerta Index: {FH}')
+    if 0 < FH <= 30:
+        #print('Legibilidad FH: muy difícil.')
+        legibilidad_fh = 'muy díficil'
+    if 30 < FH <= 50:
+        #print('Legibilidad FH: difícil.')
+        legibilidad_fh = 'díficil'
+    if 50 < FH <= 60:
+        #print('Legibilidad FH: ligeramente difícil.')
+        legibilidad_fh = 'ligeramente díficil'
+    if 60 < FH <= 70:
+        #print('Legibilidad FH: adecuado.')
+        legibilidad_fh = 'adecuado'
+    if 70 < FH <= 80:
+        #print('Legibilidad FH: ligeramente fácil.')
+        legibilidad_fh = 'ligeramente fácil'
+    if 80 < FH <= 90:
+        #print('Legibilidad FH: fácil.')
+        legibilidad_fh = 'fácil'
+    if 90 < FH <= 100:
+        #print('Legibilidad FH: muy fácil.')
+        legibilidad_fh = 'muy fácil'
+
+    return FH, legibilidad_fh
+
+# Readability of the answer according to the mu index
+def mu_index(sentencesLenght, wordsLenght, letter_per_word):
+    med = np.mean(letter_per_word)
+    var = np.var(letter_per_word)
+    mu=(wordsLenght/(wordsLenght-1))*(med/var)*100
+    mu=round(mu, 3)
+
+    legibilidad_mu = ""
+    #print(f'\nMu index: {mu}')
+    if 0 < mu <= 30:
+        #print('Legibilidad Mu: muy difícil.')
+        legibilidad_mu = 'muy difícil'
+    if 30 < mu <= 50:
+        #print('Legibilidad Mu: difícil.')
+        legibilidad_mu = 'difícil'
+    if 50 < mu <= 60:
+        #print('Legibilidad Mu: ligeramente difícil.')
+        legibilidad_mu = 'ligeramente difícil'
+    if 60 < mu <= 70:
+        #print('Legibilidad Mu: adecuado.')
+        legibilidad_mu = 'adecuado'
+    if 70 < mu <= 80:
+        #print('Legibilidad Mu: ligeramente fácil.')
+        legibilidad_mu = 'ligeramente fácil'
+    if 80 < mu <= 90:
+        #print('Legibilidad Mu: fácil.')
+        legibilidad_mu = 'fácil'
+    if 90 < mu <= 100:
+        #print('Legibilidad Mu: muy fácil.')
+        legibilidad_mu = 'muy fácil'
+
+    return mu, legibilidad_mu
+
+# Extracts the keywords of a text with the yake library
+def keyword_extractor(text, numOfKeywords, language, max_ngram_size, deduplication_threshold = 0.9, features=None):
+    test_keywords=[]
+    # Deleting special characters and setting the text to lower case
+    regex = '\\\n'
+    text = re.sub(regex, ' ', text)
+    text = text.lower()
+    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=features)
+    keywords = custom_kw_extractor.extract_keywords(text)
+    for kw in keywords:
+        test_keywords.append(kw[0])
+    return test_keywords
+
+# Word categorization
+def word_categorization(student_answer):
+    fileDocument=[]
+    TokenizeAnswer = sent_tokenize(student_answer)
+    for token in TokenizeAnswer:
+        fileDocument.append(token)
+    sentencesLenght = len(fileDocument)
+    sentence=0
+    while sentence < sentencesLenght:
+        # Word-tokenize the sentence and tag each word with its grammatical category (verb, noun, adj, etc.)
+        word_tokens = word_tokenize(fileDocument[sentence])
+        doc = nlp(fileDocument[sentence])
+        pre_chunk = [(w.text, w.pos_) for w in doc]
+        #print(pre_chunk)
+        sentence += 1
+        #pre_chunk = nltk.pos_tag(word_tokens)
+        tree = ne_chunk(pre_chunk) # same tagging as before
+        #grammer_np = ("NP: {<DT>?<JJ>*<NN>}")
+
+        # Chunking rules to filter out:
+        grammer_np = ("NP: {<DET>?<ADJ>*<NOUN>*<VERB>}")
+        grammar = r"""
+        NP: {<DT|PP\$>?<JJ>*<NN>} # chunk determiner/possessive, adjectives and nouns
+            {<NNP>+}              # chunk sequences of proper nouns
+        """
+        chunk_parser = nltk.RegexpParser(grammer_np)
+        chunk_result = chunk_parser.parse(tree)
+
+#..................................................................................................
+def char_split(word, character):
+    palabra1=""
+    palabra2=""
+    found = 0
+    for w in word:
+        if w == character and not found:
+            found = 1
+        else:
+            if not found:
+                palabra1 = palabra1 + w
+            else:
+                palabra2 = palabra2 + w
+
+    return [palabra1, palabra2]
+
+def clean_words(string):
+    words_sentence = []
+    for w in string:
+        if not w.isalnum():
+            if char_split(string, w)[0] != "":
+                words_sentence.append(char_split(string, w)[0])
+            string = char_split(string, w)[len(char_split(string, w))-1]
+
+    if string != "":
+        words_sentence.append(string)
+    return words_sentence
+
+def getNameFile(string):
+    directories = string.split("/")
+    return re.sub(".json", "", directories[len(directories)-1])
+
+
+def getIDrange(rango_ID, df):
+    if rango_ID == "All":
+        IDs = list(range(len(df['hashed_id'])))
+    else:
+        rango = []
+        r = rango_ID.split(",")
+        for i in r:
+            c_w = clean_words(i)
+            if len(c_w) == 2:
+                rango = rango + list(range(int(c_w[0]) -1, int(c_w[1])))
+            elif len(c_w) == 1:
+                rango.append(int(c_w[0]) -1)
+        IDs = rango
+
+    return IDs
+
+def save_json(path, data, isIndent = True):
+    if isIndent:
+        json_object = json.dumps(data, indent = 11, ensure_ascii= False)
+    else:
+        json_object = json.dumps(data, ensure_ascii= False)
+    # Writing output to a json file
+    with open(path, "w") as outfile:
+        outfile.write(json_object)
+
+
+def load_json(path):
+    with open(path, "r", encoding="utf8") as f:
+        data = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]")
+
+    return data
+
+def load_json_dtset(path):
+    with open(path, "r", encoding="latin-1") as f:
+        data = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]")
+
+    return data
+
+
+def splitResponse(respuesta_alumno_raw):
+    #pre-processing the student's response
+    regex = '\\\n'
+    respuesta_alumno = re.sub(regex, ' ', respuesta_alumno_raw)
+    respuesta_alumno = respuesta_alumno.lower()
+
+    #stacking each sentence of the student's response
+    sentences=[]
+    TokenizeAnswer = sent_tokenize(respuesta_alumno)
+    for token in TokenizeAnswer:
+        regex = '\\.'
+        token = re.sub(regex, '', token)
+        sentences.append(token)
+
+    return sentences
+
+def create_file_path(file, doctype):
+    """
+    This function creates relative paths to store data.
+    Inputs:
+        file: the file or subpath + file where the info is to be stored
+        doctype: 1- Info from the api, 2- Output documents, 3- Images, 4- Bert models/documents
+    Outputs:
+        path: the generated path
+    """
+    if doctype == 1:
+        path = "api/" + file
+    elif doctype == 2:
+        path = "archivos/OutputFiles2/" + file
+    elif doctype == 3:
+        path = "archivos/Images/" + file
+    else:
+        path = "codeScripts/Dependencies/BERT-models/Prueba3/" + file
     return path
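
A minimal usage sketch of the helpers changed in this file, not taken from the repository itself: it assumes codeScripts.utils is importable, that the hard-coded '/path/to/es_core_news_sm' above has been replaced with a real local spaCy model directory, and that the NLTK punkt data is installed.

# Hypothetical usage sketch for codeScripts/utils.py (assumed import path and model paths).
from codeScripts.utils import check_senteces_words, FHuertas_index, mu_index, keyword_extractor

answer = "La fotosíntesis transforma la energía luminosa en energía química. Ocurre en los cloroplastos de las células vegetales."

# Sentence, word and syllable counts feed both readability indices.
n_sent, n_words, n_syll, letters = check_senteces_words(answer)
fh, fh_label = FHuertas_index(n_sent, n_words, n_syll)
mu, mu_label = mu_index(n_sent, n_words, letters)
print(fh, fh_label, mu, mu_label)

# Top-5 Spanish keywords, up to trigrams, via the yake-based extractor.
print(keyword_extractor(answer, numOfKeywords=5, language="es", max_ngram_size=3))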