import json
import numpy as np
import hunspell
import nltk
import nltk.corpus
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import ne_chunk
import re
import yake
import spacy

#dic = hunspell.Hunspell('/Users/miguel.r/Desktop/UNIR/PLenTaS/CORPUS/dict_es_ES/es_ES', '/Users/miguel.r/Desktop/es_ES/es_ES.dic')

nlp = spacy.load('es_core_news_sm')  # Spanish (es) spaCy model
nltk.download('punkt')
# Class used to count the syllables of a word (source: https://github.com/amunozf/separasilabas/blob/master/separasilabas.py)

#class char():
    #def __init__(self):
    #    pass

class char_line():
    def __init__(self, word):
        self.word = word
        self.char_line = [(char, self.char_type(char)) for char in word]
        self.type_line = ''.join(chartype for char, chartype in self.char_line)

    def char_type(self, char):
        if char in set(['a', 'á', 'e', 'é', 'o', 'ó', 'í', 'ú']):
            return 'V'  # strong vowel
        if char in set(['i', 'u', 'ü']):
            return 'v'  # weak vowel
        if char == 'x':
            return 'x'
        if char == 's':
            return 's'
        else:
            return 'c'

    def find(self, finder):
        return self.type_line.find(finder)

    def split(self, pos, where):
        return char_line(self.word[0:pos+where]), char_line(self.word[pos+where:])

    def split_by(self, finder, where):
        split_point = self.find(finder)
        if split_point != -1:
            chl1, chl2 = self.split(split_point, where)
            return chl1, chl2
        return self, False

    def __str__(self):
        return self.word

    def __repr__(self):
        return repr(self.word)
class silabizer():
    def __init__(self):
        self.grammar = []

    def split(self, chars):
        rules = [('VV', 1), ('cccc', 2), ('xcc', 1), ('ccx', 2), ('csc', 2), ('xc', 1), ('cc', 1), ('vcc', 2), ('Vcc', 2), ('sc', 1), ('cs', 1), ('Vc', 1), ('vc', 1), ('Vs', 1), ('vs', 1)]
        for split_rule, where in rules:
            first, second = chars.split_by(split_rule, where)
            if second:
                if first.type_line in set(['c', 's', 'x', 'cs']) or second.type_line in set(['c', 's', 'x', 'cs']):
                    #print 'skip1', first.word, second.word, split_rule, chars.type_line
                    continue
                if first.type_line[-1] == 'c' and second.word[0] in set(['l', 'r']):
                    continue
                if first.word[-1] == 'l' and second.word[-1] == 'l':
                    continue
                if first.word[-1] == 'r' and second.word[-1] == 'r':
                    continue
                if first.word[-1] == 'c' and second.word[-1] == 'h':
                    continue
                return self.split(first) + self.split(second)
        return [chars]

    def __call__(self, word):
        return self.split(char_line(word))
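
# Illustrative usage (not part of the original script): a silabizer instance is
# callable on a word and returns one char_line chunk per syllable, so len() gives
# the syllable count, e.g. len(silabizer()('palabra')) -> 3.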
# Counts the sentences, words, syllables and letters per word used in the answer
def check_senteces_words(student_answer):
    # Tokenizing into sentences
    sentences = []
    words = []
    letter_per_word = []
    syll = 0  # syllables counter
    TokenizeAnswer = sent_tokenize(student_answer)
    for token in TokenizeAnswer:
        regex = '\\.'
        token = re.sub(regex, '', token)
        sentences.append(token)
    for i in range(len(sentences)):
        word = sentences[i].split(' ')
        for j in range(len(word)):
            words.append(word[j])
            syllables = silabizer()
            syll = syll + len(syllables(word[j]))
            letter_per_word.append(len(word[j]))
    sentencesLenght = len(sentences)
    wordsLenght = (len(words))
    #print(f'Number of senteces used in the answer: {sentencesLenght}')
    #print(f'Number of words used in the answer: {wordsLenght}')
    return sentencesLenght, wordsLenght, syll, letter_per_word
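
# Usage sketch (hypothetical input): the four return values feed the readability
# functions below, e.g.
#   sentencesLenght, wordsLenght, syll, letter_per_word = check_senteces_words(student_answer)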
# Spelling mistakes counter
def spelling_corrector(student_answer, hunspell_aff = '/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES', hunspell_dic = '/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES.dic'):
    dic = hunspell.Hunspell(hunspell_aff, hunspell_dic)
    errors = 0
    words = student_answer.split(' ')
    wrong_words = []
    for word in words:
        for element in clean_words(word):
            if not dic.spell(element):
                #print(f'Spelling mistake: {element}')
                wrong_words.append(element)
                errors += 1
    #print(f'Spelling mistakes: {errors}')
    return errors, wrong_words
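
# Usage sketch (hypothetical paths): the default .aff/.dic locations above are
# machine-specific, so callers will normally pass their own Hunspell files, e.g.
#   errors, wrong_words = spelling_corrector(student_answer, "dict/es_ES/es_ES", "dict/es_ES/es_ES.dic")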
# Readability of the answer according to the Fernández-Huerta index
def FHuertas_index(sentencesLenght, wordsLenght, syll):
    FH = 206.84 - 0.60*(syll*100/wordsLenght) - 1.02*(sentencesLenght*100/wordsLenght)
    FH = round(FH, 3)
    legibilidad_fh = ""
    #print(f'\nFernández-Huerta Index: {FH}')
    if 0 < FH <= 30:
        #print('Legibilidad FH: muy difícil.')
        legibilidad_fh = 'muy difícil'
    if 30 < FH <= 50:
        #print('Legibilidad FH: difícil.')
        legibilidad_fh = 'difícil'
    if 50 < FH <= 60:
        #print('Legibilidad FH: ligeramente difícil.')
        legibilidad_fh = 'ligeramente difícil'
    if 60 < FH <= 70:
        #print('Legibilidad FH: adecuado.')
        legibilidad_fh = 'adecuado'
    if 70 < FH <= 80:
        #print('Legibilidad FH: ligeramente fácil.')
        legibilidad_fh = 'ligeramente fácil'
    if 80 < FH <= 90:
        #print('Legibilidad FH: fácil.')
        legibilidad_fh = 'fácil'
    if 90 < FH <= 100:
        #print('Legibilidad FH: muy fácil.')
        legibilidad_fh = 'muy fácil'
    return FH, legibilidad_fh
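
# Worked example (hypothetical counts): for 4 sentences, 100 words and 180 syllables,
# FH = 206.84 - 0.60*(180*100/100) - 1.02*(4*100/100) = 206.84 - 108 - 4.08 = 94.76,
# which falls in the 90-100 band and is labelled 'muy fácil'.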
# Readability of the answer according to the mu index
def mu_index(sentencesLenght, wordsLenght, letter_per_word):
    med = np.mean(letter_per_word)
    var = np.var(letter_per_word)
    mu = (wordsLenght/(wordsLenght-1))*(med/var)*100
    mu = round(mu, 3)
    legibilidad_mu = ""
    #print(f'\nMu index: {mu}')
    if 0 < mu <= 30:
        #print('Legibilidad Mu: muy difícil.')
        legibilidad_mu = 'muy difícil'
    if 30 < mu <= 50:
        #print('Legibilidad Mu: difícil.')
        legibilidad_mu = 'difícil'
    if 50 < mu <= 60:
        #print('Legibilidad Mu: ligeramente difícil.')
        legibilidad_mu = 'ligeramente difícil'
    if 60 < mu <= 70:
        #print('Legibilidad Mu: adecuado.')
        legibilidad_mu = 'adecuado'
    if 70 < mu <= 80:
        #print('Legibilidad Mu: ligeramente fácil.')
        legibilidad_mu = 'ligeramente fácil'
    if 80 < mu <= 90:
        #print('Legibilidad Mu: fácil.')
        legibilidad_mu = 'fácil'
    if 90 < mu <= 100:
        #print('Legibilidad Mu: muy fácil.')
        legibilidad_mu = 'muy fácil'
    return mu, legibilidad_mu
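
# Usage sketch: the mu index takes the same counts plus the per-word letter lengths
# returned by check_senteces_words, e.g.
#   mu, legibilidad_mu = mu_index(sentencesLenght, wordsLenght, letter_per_word)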
# Keyword extraction from a text with the yake library
def keyword_extractor(text, numOfKeywords, language, max_ngram_size, deduplication_threshold = 0.9, features=None):
    test_keywords = []
    # Deleting special characters and setting the text in lower case
    regex = '\\\n'
    text = re.sub(regex, ' ', text)
    text = text.lower()
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=features)
    keywords = custom_kw_extractor.extract_keywords(text)
    for kw in keywords:
        test_keywords.append(kw[0])
    return test_keywords
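
# Usage sketch (hypothetical parameters): extracting the 10 best keywords of up to
# 3 words from a Spanish text would look like
#   keywords = keyword_extractor(text, numOfKeywords=10, language="es", max_ngram_size=3)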
# Word categorization
def word_categorization(student_answer):
    fileDocument = []
    TokenizeAnswer = sent_tokenize(student_answer)
    for token in TokenizeAnswer:
        fileDocument.append(token)
    sentencesLenght = len(fileDocument)
    sentence = 0
    while sentence < sentencesLenght:
        # Word-tokenize the sentence and tag the grammar category of each word (verb, noun, adj, etc.)
        word_tokens = word_tokenize(fileDocument[sentence])
        doc = nlp(fileDocument[sentence])
        pre_chunk = [(w.text, w.pos_) for w in doc]
        #print(pre_chunk)
        sentence += 1
        #pre_chunk = nltk.pos_tag(word_tokens)
        tree = ne_chunk(pre_chunk)  # same tagging as before
        #grammer_np = ("NP: {<DT>?<JJ>*<NN>}")
        # Chunking rules to filter out:
        grammer_np = ("NP: {<DET>?<ADJ>*<NOUN>*<VERB>}")
        grammar = r"""
        NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and nouns
            {<NNP>+}                # chunk sequences of proper nouns
        """
        chunk_parser = nltk.RegexpParser(grammer_np)
        chunk_result = chunk_parser.parse(tree)
#..................................................................................................
def char_split(word, character):
    palabra1 = ""
    palabra2 = ""
    found = 0
    for w in word:
        if w == character and not found:
            found = 1
        else:
            if not found:
                palabra1 = palabra1 + w
            else:
                palabra2 = palabra2 + w
    return [palabra1, palabra2]

def clean_words(string):
    words_sentence = []
    for w in string:
        if not w.isalnum():
            if char_split(string, w)[0] != "":
                words_sentence.append(char_split(string, w)[0])
            string = char_split(string, w)[len(char_split(string, w))-1]
    if string != "":
        words_sentence.append(string)
    return words_sentence
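
# Example: clean_words splits a raw token on every non-alphanumeric character and
# drops the separators, e.g. clean_words("hola, mundo.") -> ['hola', 'mundo'].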
def getNameFile(string):
    directories = string.split("/")
    return re.sub(".json", "", directories[len(directories)-1])

def getIDrange(rango_ID, df):
    if rango_ID == "All":
        IDs = list(range(len(df['hashed_id'])))
    else:
        rango = []
        r = rango_ID.split(",")
        for i in r:
            c_w = clean_words(i)
            if len(c_w) == 2:
                rango = rango + list(range(int(c_w[0]) - 1, int(c_w[1])))
            elif len(c_w) == 1:
                rango.append(int(c_w[0]) - 1)
        IDs = rango
    return IDs
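
# Usage sketch (hypothetical DataFrame): IDs are given 1-based, either as single
# values or as ranges separated by any non-alphanumeric character, and are returned
# as 0-based indices, e.g. getIDrange("1, 3-5", df) -> [0, 2, 3, 4]; "All" selects
# every row of df['hashed_id'].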
def save_json(path, data, isIndent = True):
    if isIndent:
        json_object = json.dumps(data, indent = 11, ensure_ascii= False)
    else:
        json_object = json.dumps(data, ensure_ascii= False)
    # Writing output to a json file
    with open(path, "w") as outfile:
        outfile.write(json_object)

def load_json(path):
    with open(path, "r", encoding="utf8") as f:
        data = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]")
    return data

def load_json_dtset(path):
    with open(path, "r", encoding="latin-1") as f:
        data = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]")
    return data
def splitResponse(respuesta_alumno_raw):
    # pre-processing the student's response
    regex = '\\\n'
    respuesta_alumno = re.sub(regex, ' ', respuesta_alumno_raw)
    respuesta_alumno = respuesta_alumno.lower()
    # stacking each sentence of the student's response
    sentences = []
    TokenizeAnswer = sent_tokenize(respuesta_alumno)
    for token in TokenizeAnswer:
        regex = '\\.'
        token = re.sub(regex, '', token)
        sentences.append(token)
    return sentences
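
# Usage sketch: splitResponse lower-cases the raw answer, joins broken lines and
# returns its sentences without trailing periods, e.g.
#   sentences = splitResponse(respuesta_alumno_raw)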
def create_file_path(file, doctype):
    """
    This function creates relative paths to store data.
    Inputs:
        file: the file or subpath + file where the info is to be stored
        doctype: 1- Info from the api, 2- Output documents, 3- Images, 4- Bert models/documents
    Outputs:
        path: the generated path
    """
    if doctype == 1:
        path = "api/" + file
    elif doctype == 2:
        path = "archivos/OutputFiles2/" + file
    elif doctype == 3:
        path = "archivos/Images/" + file
    else:
        path = "codeScripts/Dependencies/BERT-models/Prueba3/" + file
    return path
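
# Minimal end-to-end sketch (illustrative, not part of the original script). The
# sample answer is hypothetical; spelling_corrector is skipped here because its
# Hunspell dictionary paths are machine-specific.
if __name__ == "__main__":
    respuesta = "El alumno explica el concepto con claridad. Además aporta un ejemplo."
    sentencesLenght, wordsLenght, syll, letter_per_word = check_senteces_words(respuesta)
    FH, legibilidad_fh = FHuertas_index(sentencesLenght, wordsLenght, syll)
    mu, legibilidad_mu = mu_index(sentencesLenght, wordsLenght, letter_per_word)
    print(f"Sentences: {sentencesLenght}, words: {wordsLenght}, syllables: {syll}")
    print(f"Fernández-Huerta: {FH} ({legibilidad_fh}) | mu: {mu} ({legibilidad_mu})")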