xiomarablanco committed
Commit
155d7f2
1 Parent(s): 4714ef8

Changed 'es_core_news_sm' to '/path/to/es_core_news_sm'

Files changed (1)
  1. codeScripts/utils.py +339 -338
codeScripts/utils.py CHANGED
@@ -1,339 +1,340 @@
+import json
+import numpy as np
+import hunspell
+import nltk
+import nltk.corpus
+from nltk.tokenize import sent_tokenize
+from nltk.tokenize import word_tokenize
+from nltk import ne_chunk
+import re
+import yake
+import spacy
+#dic = hunspell.Hunspell('/Users/miguel.r/Desktop/UNIR/PLenTaS/CORPUS/dict_es_ES/es_ES', '/Users/miguel.r/Desktop/es_ES/es_ES.dic')
+
-nlp = spacy.load('es_core_news_sm') # spaCy Spanish-language package (es)
+nlp = spacy.load('/path/to/es_core_news_sm')
+#nlp = spacy.load('es_core_news_sm') # spaCy Spanish-language package (es)
+
+# Class created to count the syllables of a word (Source: https://github.com/amunozf/separasilabas/blob/master/separasilabas.py)
+
+#class char():
+    #def __init__(self):
+        #pass
+
+class char_line():
+    def __init__(self, word):
+        self.word = word
+        self.char_line = [(char, self.char_type(char)) for char in word]
+        self.type_line = ''.join(chartype for char, chartype in self.char_line)
+
+    def char_type(self, char):
+        if char in set(['a', 'á', 'e', 'é', 'o', 'ó', 'í', 'ú']):
+            return 'V' #strong vowel
+        if char in set(['i', 'u', 'ü']):
+            return 'v' #weak vowel
+        if char=='x':
+            return 'x'
+        if char=='s':
+            return 's'
+        else:
+            return 'c'
+
+    def find(self, finder):
+        return self.type_line.find(finder)
+
+    def split(self, pos, where):
+        return char_line(self.word[0:pos+where]), char_line(self.word[pos+where:])
+
+    def split_by(self, finder, where):
+        split_point = self.find(finder)
+        if split_point!=-1:
+            chl1, chl2 = self.split(split_point, where)
+            return chl1, chl2
+        return self, False
+
+    def __str__(self):
+        return self.word
+
+    def __repr__(self):
+        return repr(self.word)
+
+class silabizer():
+    def __init__(self):
+        self.grammar = []
+
+    def split(self, chars):
+        rules = [('VV',1), ('cccc',2), ('xcc',1), ('ccx',2), ('csc',2), ('xc',1), ('cc',1), ('vcc',2), ('Vcc',2), ('sc',1), ('cs',1), ('Vc',1), ('vc',1), ('Vs',1), ('vs',1)]
+        for split_rule, where in rules:
+            first, second = chars.split_by(split_rule, where)
+            if second:
+                if first.type_line in set(['c','s','x','cs']) or second.type_line in set(['c','s','x','cs']):
+                    #print 'skip1', first.word, second.word, split_rule, chars.type_line
+                    continue
+                if first.type_line[-1]=='c' and second.word[0] in set(['l','r']):
+                    continue
+                if first.word[-1]=='l' and second.word[-1]=='l':
+                    continue
+                if first.word[-1]=='r' and second.word[-1]=='r':
+                    continue
+                if first.word[-1]=='c' and second.word[-1]=='h':
+                    continue
+                return self.split(first)+self.split(second)
+        return [chars]
+
+    def __call__(self, word):
+        return self.split(char_line(word))
+
+# Counts the number of sentences and words used in the answer
+def check_senteces_words(student_answer):
+
+    # Tokenizing into sentences
+    sentences=[]
+    words=[]
+    letter_per_word=[]
+    syll=0 # syllables counter
+
+    TokenizeAnswer = sent_tokenize(student_answer)
+    for token in TokenizeAnswer:
+        regex = '\\.'
+        token = re.sub(regex, '', token)
+        sentences.append(token)
+    for i in range(len(sentences)):
+        word = sentences[i].split(' ')
+        for j in range(len(word)):
+            words.append(word[j])
+            syllables = silabizer()
+            syll=syll+len(syllables(word[j]))
+            letter_per_word.append(len(word[j]))
+
+    sentencesLenght = len(sentences)
+    wordsLenght = (len(words))
+    #print(f'Number of senteces used in the answer: {sentencesLenght}')
+    #print(f'Number of words used in the answer: {wordsLenght}')
+
+    return sentencesLenght, wordsLenght, syll, letter_per_word
+
+# Spelling-mistake counter
+def spelling_corrector(student_answer, hunspell_aff = '/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES', hunspell_dic = '/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES.dic'):
+
+    dic = hunspell.Hunspell(hunspell_aff, hunspell_dic)
+    errors=0
+    words = student_answer.split(' ')
+    wrong_words = []
+    for word in words:
+        for element in clean_words(word):
+            if not dic.spell(element):
+                #print(f'Spelling mistake: {element}')
+                wrong_words.append(element)
+                errors+=1
+    #print(f'Spelling mistakes: {errors}')
+    return errors, wrong_words
+
+# Readability of the answer according to the Fernández-Huerta index
+def FHuertas_index(sentencesLenght, wordsLenght, syll):
+    FH = 206.84 - 0.60*(syll*100/wordsLenght) - 1.02*(sentencesLenght*100/wordsLenght)
+    FH = round(FH, 3)
+    legibilidad_fh = ""
+    #print(f'\nFernández-Huerta Index: {FH}')
+    if 0 < FH <= 30:
+        #print('Legibilidad FH: muy difícil.')
+        legibilidad_fh = 'muy díficil'
+    if 30 < FH <= 50:
+        #print('Legibilidad FH: difícil.')
+        legibilidad_fh = 'díficil'
+    if 50 < FH <= 60:
+        #print('Legibilidad FH: ligeramente difícil.')
+        legibilidad_fh = 'ligeramente díficil'
+    if 60 < FH <= 70:
+        #print('Legibilidad FH: adecuado.')
+        legibilidad_fh = 'adecuado'
+    if 70 < FH <= 80:
+        #print('Legibilidad FH: ligeramente fácil.')
+        legibilidad_fh = 'ligeramente fácil'
+    if 80 < FH <= 90:
+        #print('Legibilidad FH: fácil.')
+        legibilidad_fh = 'fácil'
+    if 90 < FH <= 100:
+        #print('Legibilidad FH: muy fácil.')
+        legibilidad_fh = 'muy fácil'
+
+    return FH, legibilidad_fh
+
+# Readability of the answer according to the mu index
+def mu_index(sentencesLenght, wordsLenght, letter_per_word):
+    med = np.mean(letter_per_word)
+    var = np.var(letter_per_word)
+    mu=(wordsLenght/(wordsLenght-1))*(med/var)*100
+    mu=round(mu, 3)
+
+    legibilidad_mu = ""
+    #print(f'\nMu index: {mu}')
+    if 0 < mu <= 30:
+        #print('Legibilidad Mu: muy difícil.')
+        legibilidad_mu = 'muy difícil'
+    if 30 < mu <= 50:
+        #print('Legibilidad Mu: difícil.')
+        legibilidad_mu = 'difícil'
+    if 50 < mu <= 60:
+        #print('Legibilidad Mu: ligeramente difícil.')
+        legibilidad_mu = 'ligeramente difícil'
+    if 60 < mu <= 70:
+        #print('Legibilidad Mu: adecuado.')
+        legibilidad_mu = 'adecuado'
+    if 70 < mu <= 80:
+        #print('Legibilidad Mu: ligeramente fácil.')
+        legibilidad_mu = 'ligeramente fácil'
+    if 80 < mu <= 90:
+        #print('Legibilidad Mu: fácil.')
+        legibilidad_mu = 'fácil'
+    if 90 < mu <= 100:
+        #print('Legibilidad Mu: muy fácil.')
+        legibilidad_mu = 'muy fácil'
+
+    return mu, legibilidad_mu
+
+# Extracts the keywords of a text with the yake library
+def keyword_extractor(text, numOfKeywords, language, max_ngram_size, deduplication_threshold = 0.9, features=None):
+    test_keywords=[]
+    # Deleting special characters and setting the text to lower case
+    regex = '\\\n'
+    text = re.sub(regex, ' ', text)
+    text = text.lower()
+    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=features)
+    keywords = custom_kw_extractor.extract_keywords(text)
+    for kw in keywords:
+        test_keywords.append(kw[0])
+    return test_keywords
+
+# Word categorization
+def word_categorization(student_answer):
+    fileDocument=[]
+    TokenizeAnswer = sent_tokenize(student_answer)
+    for token in TokenizeAnswer:
+        fileDocument.append(token)
+    sentencesLenght = len(fileDocument)
+    sentence=0
+    while sentence < sentencesLenght:
+        # Word-tokenize the sentence and tag each word with its grammatical category (verb, noun, adj, etc.)
+        word_tokens = word_tokenize(fileDocument[sentence])
+        doc = nlp(fileDocument[sentence])
+        pre_chunk = [(w.text, w.pos_) for w in doc]
+        #print(pre_chunk)
+        sentence += 1
+        #pre_chunk = nltk.pos_tag(word_tokens)
+        tree = ne_chunk(pre_chunk) # same tagging as before
+        #grammer_np = ("NP: {<DT>?<JJ>*<NN>}")
+
+        # Chunking rules to filter out:
+        grammer_np = ("NP: {<DET>?<ADJ>*<NOUN>*<VERB>}")
+        grammar = r"""
+        NP: {<DT|PP\$>?<JJ>*<NN>} # chunk determiner/possessive, adjectives and nouns
+            {<NNP>+}              # chunk sequences of proper nouns
+        """
+        chunk_parser = nltk.RegexpParser(grammer_np)
+        chunk_result = chunk_parser.parse(tree)
+
+#..................................................................................................
+def char_split(word, character):
+    palabra1=""
+    palabra2=""
+    found = 0
+    for w in word:
+        if w == character and not found:
+            found = 1
+        else:
+            if not found:
+                palabra1 = palabra1 + w
+            else:
+                palabra2 = palabra2 + w
+
+    return [palabra1, palabra2]
+
+def clean_words(string):
+    words_sentence = []
+    for w in string:
+        if not w.isalnum():
+            if char_split(string, w)[0] != "":
+                words_sentence.append(char_split(string, w)[0])
+            string = char_split(string, w)[len(char_split(string, w))-1]
+
+    if string != "":
+        words_sentence.append(string)
+    return words_sentence
+
+def getNameFile(string):
+    directories = string.split("/")
+    return re.sub(".json", "", directories[len(directories)-1])
+
+
+def getIDrange(rango_ID, df):
+    if rango_ID == "All":
+        IDs = list(range(len(df['hashed_id'])))
+    else:
+        rango = []
+        r = rango_ID.split(",")
+        for i in r:
+            c_w = clean_words(i)
+            if len(c_w) == 2:
+                rango = rango + list(range(int(c_w[0]) -1, int(c_w[1])))
+            elif len(c_w) == 1:
+                rango.append(int(c_w[0]) -1)
+        IDs = rango
+
+    return IDs
+
+def save_json(path, data, isIndent = True):
+    if isIndent:
+        json_object = json.dumps(data, indent = 11, ensure_ascii= False)
+    else:
+        json_object = json.dumps(data, ensure_ascii= False)
+    # Writing output to a json file
+    with open(path, "w") as outfile:
+        outfile.write(json_object)
+
+
+def load_json(path):
+    with open(path, "r", encoding="utf8") as f:
+        data = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]")
+
+    return data
+
+def load_json_dtset(path):
+    with open(path, "r", encoding="latin-1") as f:
+        data = json.loads("[" + f.read().replace("}\n{", "},\n{") + "]")
+
+    return data
+
+
+def splitResponse(respuesta_alumno_raw):
+    #pre-processing the student's response
+    regex = '\\\n'
+    respuesta_alumno = re.sub(regex, ' ', respuesta_alumno_raw)
+    respuesta_alumno = respuesta_alumno.lower()
+
+    #stacking each sentence of the student's response
+    sentences=[]
+    TokenizeAnswer = sent_tokenize(respuesta_alumno)
+    for token in TokenizeAnswer:
+        regex = '\\.'
+        token = re.sub(regex, '', token)
+        sentences.append(token)
+
+    return sentences
+
+def create_file_path(file, doctype):
+    """
+    This function creates relative paths to store data.
+    Inputs:
+        file: the file or subpath + file where the info is to be stored
+        doctype: 1- Info from the api, 2- Output documents, 3- Images, 4- Bert models/documents
+    Outputs:
+        path: the generated path
+    """
+    if doctype == 1:
+        path = "api/" + file
+    elif doctype == 2:
+        path = "archivos/OutputFiles2/" + file
+    elif doctype == 3:
+        path = "archivos/Images/" + file
+    else:
+        path = "codeScripts/Dependencies/BERT-models/Prueba3/" + file
     return path
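
A minimal usage sketch of the helpers changed in this file, not taken from the repository itself: it assumes codeScripts.utils is importable, that the hard-coded '/path/to/es_core_news_sm' above has been replaced with a real local spaCy model directory, and that the NLTK punkt data is installed.

# Hypothetical usage sketch for codeScripts/utils.py (assumed import path and model paths).
from codeScripts.utils import check_senteces_words, FHuertas_index, mu_index, keyword_extractor

answer = "La fotosíntesis transforma la energía luminosa en energía química. Ocurre en los cloroplastos de las células vegetales."

# Sentence, word and syllable counts feed both readability indices.
n_sent, n_words, n_syll, letters = check_senteces_words(answer)
fh, fh_label = FHuertas_index(n_sent, n_words, n_syll)
mu, mu_label = mu_index(n_sent, n_words, letters)
print(fh, fh_label, mu, mu_label)

# Top-5 Spanish keywords, up to trigrams, via the yake-based extractor.
print(keyword_extractor(answer, numOfKeywords=5, language="es", max_ngram_size=3))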