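# plentas / codeScripts / utils.py
# File-viewer header captured together with the source (not part of the code):
# xiomarablanco's picture, commit message "nltk.download('punkt')",
# revision e228d0e, size 12.2 kB.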
import json
import numpy as np
import hunspell
import nltk
import nltk.corpus
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import ne_chunk
import re
import yake
import spacy
#dic = hunspell.Hunspell('/Users/miguel.r/Desktop/UNIR/PLenTaS/CORPUS/dict_es_ES/es_ES', '/Users/miguel.r/Desktop/es_ES/es_ES.dic')
# Module-level setup: both statements below run once, when this module is imported.
nlp = spacy.load('es_core_news_sm') # spaCy Spanish (es) pipeline
# Fetch the NLTK 'punkt' resource; presumably required by the sent_tokenize /
# word_tokenize calls used in this file - confirm against the NLTK version in use.
nltk.download('punkt')
# Helper class used to count the syllables of a word.
# Source: https://github.com/amunozf/separasilabas/blob/master/separasilabas.py
class char_line():
    """A word together with a per-character classification string.

    Attributes:
        word: the original text.
        char_line: list of ``(character, type)`` pairs, one per character.
        type_line: the types joined into one string, e.g. ``'cVcV'`` for
            ``'hola'``.

    Character types are ``'V'`` (strong vowel), ``'v'`` (weak vowel),
    ``'x'``, ``'s'`` and ``'c'`` (anything else).
    """

    _STRONG_VOWELS = frozenset('aáeéoóíú')
    _WEAK_VOWELS = frozenset('iuü')

    def __init__(self, word):
        self.word = word
        self.char_line = [(char, self.char_type(char)) for char in word]
        self.type_line = ''.join(chartype for char, chartype in self.char_line)

    def char_type(self, char):
        """Return the one-letter type code for a single character.

        The lookup is case-insensitive so that capitalised words
        ("Ana", "ÁRBOL") are classified like their lowercase forms
        instead of having their vowels treated as consonants.
        """
        folded = char.lower()
        if folded in self._STRONG_VOWELS:
            return 'V'  # strong vowel
        if folded in self._WEAK_VOWELS:
            return 'v'  # weak vowel
        if folded == 'x':
            return 'x'
        if folded == 's':
            return 's'
        return 'c'

    def find(self, finder):
        """Return the index of pattern ``finder`` in ``type_line`` or -1."""
        return self.type_line.find(finder)

    def split(self, pos, where):
        """Cut the word at ``pos + where`` and return both halves."""
        cut = pos + where
        return char_line(self.word[0:cut]), char_line(self.word[cut:])

    def split_by(self, finder, where):
        """Cut the word ``where`` characters into the first ``finder`` match.

        Returns:
            ``(head, tail)`` as two ``char_line`` objects when the pattern is
            present, otherwise ``(self, False)``.
        """
        split_point = self.find(finder)
        if split_point != -1:
            return self.split(split_point, where)
        return self, False

    def __str__(self):
        return self.word

    def __repr__(self):
        return repr(self.word)
class silabizer():
    """Split a word into syllable-like fragments.

    Calling an instance with a string returns a list of ``char_line``
    fragments; the length of that list is the syllable count.
    """

    def __init__(self):
        # Kept so the public attribute still exists; nothing in this class reads it.
        self.grammar = []

    def split(self, chars):
        """Recursively cut ``chars`` (a ``char_line``) into fragments.

        Rules are tried in order. Each rule is a pattern over character
        types plus the offset, counted from the start of the match, at which
        to cut. A candidate cut is rejected when it would isolate a bare
        consonant group, separate a consonant from a following ``l``/``r``,
        or break one of the digraphs ``ll``, ``rr`` or ``ch``.

        Returns:
            A list of ``char_line`` fragments; ``[chars]`` when no rule
            yields an acceptable cut.
        """
        rules = [('VV', 1), ('cccc', 2), ('xcc', 1), ('ccx', 2), ('csc', 2),
                 ('xc', 1), ('cc', 1), ('vcc', 2), ('Vcc', 2), ('sc', 1),
                 ('cs', 1), ('Vc', 1), ('vc', 1), ('Vs', 1), ('vs', 1)]
        consonant_only = {'c', 's', 'x', 'cs'}
        digraphs = {('l', 'l'), ('r', 'r'), ('c', 'h')}

        for split_rule, where in rules:
            first, second = chars.split_by(split_rule, where)
            if not second:
                # split_by returned (self, False): pattern not present.
                continue
            if first.type_line in consonant_only or second.type_line in consonant_only:
                continue
            # Letters on each side of the cut. The digraph check must look at
            # the FIRST letter of the tail; comparing its last letter let
            # "coche" split as "coc-he" and blocked valid cuts such as
            # "al-cohol".
            head_last = first.word[-1].lower()
            tail_first = second.word[0].lower()
            if first.type_line[-1] == 'c' and tail_first in ('l', 'r'):
                continue
            if (head_last, tail_first) in digraphs:
                continue
            return self.split(first) + self.split(second)
        return [chars]

    def __call__(self, word):
        """Return the fragments of ``word`` (a plain string)."""
        return self.split(char_line(word))
# Counts the sentences, words and syllables used in the answer
def check_senteces_words(student_answer):
    """Count sentences, words, syllables and per-word lengths of an answer.

    The answer is split into sentences with ``sent_tokenize`` and every
    period is removed from each sentence. Words are then obtained by
    splitting on whitespace; runs of spaces no longer produce empty "words"
    (each of which used to add one word, one syllable and a zero length).

    Args:
        student_answer: the raw answer text.

    Returns:
        A tuple ``(sentence_count, word_count, syllable_count,
        letter_per_word)`` where ``letter_per_word`` is a list with the
        length of every word, in order.
    """
    sentences = [re.sub('\\.', '', token) for token in sent_tokenize(student_answer)]

    syllabizer = silabizer()  # one instance is enough for all words
    words = []
    letter_per_word = []
    syll = 0  # syllables counter
    for sentence in sentences:
        for word in sentence.split():
            words.append(word)
            syll += len(syllabizer(word))
            letter_per_word.append(len(word))

    sentencesLenght = len(sentences)
    wordsLenght = len(words)
    return sentencesLenght, wordsLenght, syll, letter_per_word
# Spelling-mistake counter
def spelling_corrector(student_answer, hunspell_aff = '/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES' , hunspell_dic = '/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES.dic' ):
    """Return how many words of the answer the Hunspell dictionary rejects.

    Args:
        student_answer: the text to check.
        hunspell_aff: first argument passed to ``hunspell.Hunspell``.
        hunspell_dic: second argument passed to ``hunspell.Hunspell``.
            NOTE(review): both defaults are absolute paths on one
            developer's machine; callers should pass their own.

    Returns:
        ``(errors, wrong_words)``: the number of rejected words and the
        rejected words themselves, in order of appearance.
    """
    checker = hunspell.Hunspell(hunspell_aff, hunspell_dic)
    wrong_words = [
        piece
        for chunk in student_answer.split(' ')
        for piece in clean_words(chunk)
        if not checker.spell(piece)
    ]
    return len(wrong_words), wrong_words
# Readability of the answer according to the Fernández-Huerta index
def FHuertas_index(sentencesLenght, wordsLenght, syll):
    """Compute the Fernández-Huerta readability score and its label.

    ``FH = 206.84 - 0.60 * P - 1.02 * F`` where ``P`` is syllables per 100
    words and ``F`` is sentences per 100 words. The result is rounded to
    three decimals.

    Args:
        sentencesLenght: number of sentences.
        wordsLenght: number of words.
        syll: number of syllables.

    Returns:
        ``(FH, label)``. The label is a Spanish difficulty description for
        scores in ``(0, 100]`` and ``""`` otherwise. A text with no words
        has no defined score, so ``(0.0, "")`` is returned instead of
        raising ``ZeroDivisionError``.
    """
    if wordsLenght == 0:
        return 0.0, ""

    FH = 206.84 - 0.60*(syll*100/wordsLenght) - 1.02*(sentencesLenght*100/wordsLenght)
    FH = round(FH, 3)

    # (inclusive upper bound, label). Labels are spelled "difícil", matching
    # mu_index; they were previously misspelled "díficil" here.
    bands = (
        (30, 'muy difícil'),
        (50, 'difícil'),
        (60, 'ligeramente difícil'),
        (70, 'adecuado'),
        (80, 'ligeramente fácil'),
        (90, 'fácil'),
        (100, 'muy fácil'),
    )
    legibilidad_fh = ""
    if FH > 0:
        for upper_bound, label in bands:
            if FH <= upper_bound:
                legibilidad_fh = label
                break
    return FH, legibilidad_fh
# Readability of the answer according to the mu index
def mu_index(sentencesLenght, wordsLenght, letter_per_word):
    """Compute the mu readability score and its label.

    ``mu = (n / (n - 1)) * (mean / variance) * 100`` where ``n`` is the word
    count and mean/variance are taken over the letters-per-word list
    (``np.mean`` / ``np.var``). The result is rounded to three decimals.

    Args:
        sentencesLenght: unused; kept for interface compatibility.
        wordsLenght: number of words.
        letter_per_word: sequence with the length of each word.

    Returns:
        ``(mu, label)``. The label is a Spanish difficulty description for
        scores in ``(0, 100]`` and ``""`` otherwise. The formula is undefined
        with fewer than two words, with no word lengths, or when every word
        has the same length (zero variance); ``(0.0, "")`` is returned in
        those cases instead of raising ``ZeroDivisionError`` or yielding
        inf/nan.
    """
    if wordsLenght < 2 or len(letter_per_word) == 0:
        return 0.0, ""

    med = np.mean(letter_per_word)
    var = np.var(letter_per_word)
    if var == 0:
        return 0.0, ""

    mu = (wordsLenght/(wordsLenght-1))*(med/var)*100
    mu = round(mu, 3)

    # (inclusive upper bound, label)
    bands = (
        (30, 'muy difícil'),
        (50, 'difícil'),
        (60, 'ligeramente difícil'),
        (70, 'adecuado'),
        (80, 'ligeramente fácil'),
        (90, 'fácil'),
        (100, 'muy fácil'),
    )
    legibilidad_mu = ""
    if mu > 0:
        for upper_bound, label in bands:
            if mu <= upper_bound:
                legibilidad_mu = label
                break
    return mu, legibilidad_mu
# Keyword extractor for a text, based on the yake library
def keyword_extractor(text, numOfKeywords, language, max_ngram_size,deduplication_threshold = 0.9, features=None):
    """Return the keywords yake extracts from ``text``.

    Newlines are replaced by spaces and the text is lower-cased before
    extraction.

    Args:
        text: input text.
        numOfKeywords: passed to yake as ``top``.
        language: passed to yake as ``lan``.
        max_ngram_size: passed to yake as ``n``.
        deduplication_threshold: passed to yake as ``dedupLim``.
        features: passed to yake as ``features``.

    Returns:
        A list with the first element of every item returned by
        ``extract_keywords``, in the order yake returns them.
    """
    # The pattern is a backslash-escaped newline, i.e. it matches "\n".
    normalized = re.sub('\\\n', ' ', text).lower()
    extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        top=numOfKeywords,
        features=features,
    )
    return [candidate[0] for candidate in extractor.extract_keywords(normalized)]
# Word categorization: tags the words of each sentence of the answer with spaCy
# part-of-speech labels and runs an NLTK RegexpParser over the tagged result.
# NOTE(review): no return statement is visible, so the parse result is discarded.
# NOTE(review): the original indentation is not visible here, so it cannot be
# determined which of the statements below belong to the while loop - confirm.
def word_categorization(student_answer):
# Collect the sentences produced by sent_tokenize into a list.
fileDocument=[]
TokenizeAnswer = sent_tokenize(student_answer)
for token in TokenizeAnswer:
fileDocument.append(token)
sentencesLenght = len(fileDocument)
sentence=0
while sentence < sentencesLenght:
# Tokenize the sentence into words and tag every word with its grammatical category (verb, noun, adjective, ...).
# word_tokens is not read again in the visible code.
word_tokens = word_tokenize(fileDocument[sentence])
doc = nlp(fileDocument[sentence])
# (text, part-of-speech) pairs taken from the spaCy document.
pre_chunk = [(w.text, w.pos_) for w in doc]
#print(pre_chunk)
sentence += 1
#pre_chunk = nltk.pos_tag(word_tokens)
tree = ne_chunk(pre_chunk) # ne_chunk is fed the spaCy-tagged pairs built above
#grammer_np = ("NP: {<DT>?<JJ>*<NN>}")
# Chunking rule actually used by the parser below:
grammer_np = ("NP: {<DET>?<ADJ>*<NOUN>*<VERB>}")
# Alternative grammar; it is assigned but not passed to RegexpParser in the visible code.
grammar = r"""
NP: {<DT|PP\$>?<JJ>*<NN>} # chunk determiner/possessive, adjectives and nouns
{<NNP>+} # chunk sequences of proper nouns
"""
chunk_parser = nltk.RegexpParser(grammer_np)
chunk_result = chunk_parser.parse(tree)
#..................................................................................................
def char_split(word, character):
    """Split ``word`` around the first occurrence of ``character``.

    Characters of ``word`` are compared one by one with ``character``.

    Returns:
        ``[before, after]``: the text before the first match and the text
        after it (the matching character itself is dropped). When nothing
        matches, ``[word, ""]``.
    """
    for position, symbol in enumerate(word):
        if symbol == character:
            return [word[:position], word[position + 1:]]
    return [word, ""]
def clean_words(string):
    """Return the alphanumeric runs contained in ``string``.

    Every character for which ``str.isalnum()`` is false acts as a
    separator; empty pieces are discarded.

    The result is the same as the previous implementation, but it is
    produced in a single pass instead of re-splitting the remaining text
    several times per separator while rebinding the string being iterated.

    Args:
        string: the text to split.

    Returns:
        A list of non-empty alphanumeric substrings, in order.
    """
    words_sentence = []
    current = []  # characters of the run being accumulated
    for w in string:
        if w.isalnum():
            current.append(w)
        elif current:
            words_sentence.append("".join(current))
            current = []
    if current:
        words_sentence.append("".join(current))
    return words_sentence
def getNameFile(string):
    """Return the last ``/``-separated component of a path without ".json".

    The text ".json" is removed literally. It used to be applied as the
    regular expression ``.json``, whose unescaped dot matches any character,
    so a name such as "myjson.json" was reduced to "m".

    Args:
        string: a path using ``/`` as separator.

    Returns:
        The file name with every literal ".json" removed.
    """
    file_name = string.split("/")[-1]
    return file_name.replace(".json", "")
def getIDrange(rango_ID, df):
    """Translate a textual ID selection into a list of zero-based indices.

    Args:
        rango_ID: either ``"All"`` or a comma-separated list whose items
            hold one number (``"7"``) or two numbers (``"3-5"``), as split by
            ``clean_words``. Numbers are one-based.
        df: object indexable by ``'hashed_id'``; only its length is used,
            and only for ``"All"``.

    Returns:
        For ``"All"``: ``[0, 1, ..., len(df['hashed_id']) - 1]``. Otherwise,
        for each item, either the single index ``n - 1`` or the indices
        ``first - 1`` up to ``last - 1`` inclusive. Items that yield neither
        one nor two numbers are ignored.
    """
    if rango_ID == "All":
        return list(range(len(df['hashed_id'])))

    IDs = []
    for chunk in rango_ID.split(","):
        bounds = clean_words(chunk)
        if len(bounds) == 2:
            IDs.extend(range(int(bounds[0]) - 1, int(bounds[1])))
        elif len(bounds) == 1:
            IDs.append(int(bounds[0]) - 1)
    return IDs
def save_json(path, data, isIndent = True):
    """Serialize ``data`` as JSON and write it to ``path``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``), so the
    file is opened explicitly as UTF-8. Without an explicit encoding the
    platform default was used, which can fail on, or mis-encode, accented
    text and does not match the UTF-8 that ``load_json`` reads.

    Args:
        path: destination file; it is overwritten.
        data: any JSON-serializable object.
        isIndent: when true the output is pretty-printed with an indent of
            11 spaces; otherwise it is written compactly.
    """
    if isIndent:
        json_object = json.dumps(data, indent = 11, ensure_ascii= False)
    else:
        json_object = json.dumps(data, ensure_ascii= False)

    # Writing output to a json file
    with open(path, "w", encoding="utf-8") as outfile:
        outfile.write(json_object)
def load_json(path):
    """Load a UTF-8 file holding one or more JSON objects into a list.

    The file content is wrapped in ``[...]`` and a comma is inserted
    wherever a line ending in ``}`` is directly followed by a line starting
    with ``{``, so newline-separated objects parse as one JSON array.

    Args:
        path: file to read.

    Returns:
        The parsed list.
    """
    with open(path, "r", encoding="utf8") as handle:
        raw_text = handle.read()
    as_array = "[" + raw_text.replace("}\n{", "},\n{") + "]"
    return json.loads(as_array)
def load_json_dtset(path):
    """Load a latin-1 file holding one or more JSON objects into a list.

    Works like ``json.loads`` on the file content wrapped in ``[...]``,
    after inserting a comma between a line ending in ``}`` and a following
    line starting with ``{``.

    Args:
        path: file to read; it is decoded as latin-1.

    Returns:
        The parsed list.
    """
    with open(path, "r", encoding="latin-1") as handle:
        content = handle.read()
    separated = content.replace("}\n{", "},\n{")
    return json.loads("[" + separated + "]")
def splitResponse(respuesta_alumno_raw):
    """Split a student's response into lower-cased sentences without periods.

    Newlines are replaced by spaces and the text is lower-cased; the result
    is split with ``sent_tokenize`` and every period is removed from each
    sentence.

    Args:
        respuesta_alumno_raw: the raw response text.

    Returns:
        A list with one string per sentence.
    """
    # Pre-processing: the pattern is a backslash-escaped newline, i.e. "\n".
    flattened = re.sub('\\\n', ' ', respuesta_alumno_raw).lower()
    return [re.sub('\\.', '', sentence) for sentence in sent_tokenize(flattened)]
def create_file_path(file, doctype):
    """
    Build the relative path under which a file is stored.

    Inputs:
        file: the file name, optionally preceded by a sub-path
        doctype: 1- Info from the api, 2- Output documents, 3- Images,
                 anything else- Bert models/documents
    Outputs:
        path: the directory prefix for ``doctype`` followed by ``file``
    """
    known_prefixes = (
        (1, "api/"),
        (2, "archivos/OutputFiles2/"),
        (3, "archivos/Images/"),
    )
    for code, prefix in known_prefixes:
        if doctype == code:
            return prefix + file
    # Any other doctype falls back to the BERT models directory.
    return "codeScripts/Dependencies/BERT-models/Prueba3/" + file