Update text_cleaning_transforerms.py
text_cleaning_transforerms.py CHANGED (+28 −15)
@@ -4,10 +4,6 @@ from os import listdir
 from os.path import isfile, join
 import numpy as np
 import re
-import nltk
-nltk.download('punkt')
-nltk.download('stopwords')
-nltk.download('wordnet')
 
 from gensim.parsing import preprocessing
 from gensim.parsing.preprocessing import strip_tags, strip_punctuation
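Note on this hunk: the module still uses nltk tokenizers (word_tokenize in the new helper below, and the sentence split inside pre_process), so dropping the import-time download calls means the punkt/stopwords/wordnet resources must now be fetched once per environment. A minimal one-time setup sketch, not part of this commit, with resource names taken from the deleted lines:

    import nltk

    # fetch the resources the old import-time calls pulled on every run
    for resource in ("punkt", "stopwords", "wordnet"):
        nltk.download(resource, quiet=True)  # skipped if already up to date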
@@ -181,6 +177,19 @@ def text_cleaning(data,min_lenght=2,extra_clean=True, remove_punctuation=False):
 
     return clean_t
 
+def split_by_chuncks(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuation=False):
+    pre_processed_chunks = []
+    words = word_tokenize(data)
+    lower_b, upper_b = 0, max_size
+    for x in range(math.ceil(len(words)/max_size)):
+        sample = " ".join(x for x in words[lower_b:upper_b])
+        lower_b, upper_b = upper_b, upper_b+max_size
+        clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+
+        pre_processed_chunks.append(clean_data)
+
+    return pre_processed_chunks
+
 # set only_data = True if no need to get scores or if dataaset doesn't have a score
 def pre_process(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuation=False):
 
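The new split_by_chuncks helper centralizes the fixed-size windowing that the last hunk below removes from pre_process. A standalone sketch of the same slicing pattern, runnable without the module (plain str.split stands in for word_tokenize):

    import math

    def chunk_words(words, max_size=64):
        # slide a fixed-size window over the token list, mirroring the
        # lower_b/upper_b bookkeeping in split_by_chuncks
        chunks, lower_b, upper_b = [], 0, max_size
        for _ in range(math.ceil(len(words) / max_size)):
            chunks.append(" ".join(words[lower_b:upper_b]))
            lower_b, upper_b = upper_b, upper_b + max_size
        return chunks

    print(chunk_words("a b c d e f g".split(), max_size=3))
    # ['a b c', 'd e f', 'g']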
@@ -194,12 +203,18 @@ def pre_process(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuation=False):
     data_pre_processed_chunks,sample = [],""
 
     # Were able to split into sentences
-    if len(sentences)>
+    if len(sentences)>2:
         for index,sentence in enumerate(sentences):
             if len(sentence.split()) + len(sample.split()) <= max_size:
                 sample += sentence
             else:
-
+                if len(sample.split())>1:
+                    clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+                    if len(clean_data.split()) > max_size:
+                        pre_processed_chunks = split_by_chuncks(sample,min_lenght=min_lenght,max_size=max_size, extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+                        data_pre_processed_chunks.extend(pre_processed_chunks)
+                    else:
+                        data_pre_processed_chunks.append(clean_data)
                 sample = sentence if index < len(sentences)-1 else ""
 
         if len(sample) ==0:
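The sentence branch above greedily packs whole sentences until the next one would push the running sample past max_size words; only then is the sample cleaned and flushed. A self-contained sketch of that packing rule (sentences is assumed to come from a sentence tokenizer; note the commit concatenates with sample += sentence, with no separator added):

    def pack_sentences(sentences, max_size=64):
        chunks, sample = [], ""
        for index, sentence in enumerate(sentences):
            if len(sentence.split()) + len(sample.split()) <= max_size:
                sample += sentence  # same no-separator concatenation as the commit
            else:
                if len(sample.split()) > 1:
                    chunks.append(sample)  # flush (the commit cleans it first)
                sample = sentence if index < len(sentences) - 1 else ""
        if sample:
            chunks.append(sample)
        return chunks

    print(pack_sentences(["one two three. ", "four five six. ", "seven eight."], max_size=5))
    # ['one two three. ', 'four five six. seven eight.']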
@@ -208,18 +223,16 @@ def pre_process(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuation=False):
         clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
 
         #if len(clean_data.split()) >3:
-        data_pre_processed_chunks.append(clean_data)
+        if len(clean_data.split()) > max_size:
+            pre_processed_chunks = split_by_chuncks(clean_data,min_lenght=min_lenght,max_size=max_size, extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+            data_pre_processed_chunks.extend(pre_processed_chunks)
+        else:
+            data_pre_processed_chunks.append(clean_data)
 
     # Split by get max size chunks
     else:
-        words = word_tokenize(data)
-        lower_b, upper_b = 0, max_size
-        for x in range(math.ceil(len(words)/max_size)):
-            sample = " ".join(x for x in words[lower_b:upper_b])
-            lower_b, upper_b = upper_b, upper_b+max_size
-            clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
-            #if len(clean_data.split()) >3:
-            data_pre_processed_chunks.append(clean_data)
+        pre_processed_chunks = split_by_chuncks(data,min_lenght=min_lenght,max_size=max_size, extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+        data_pre_processed_chunks.extend(pre_processed_chunks)
 
     # return the pre_processed of whoole text and chunks
     return data_pre_processed,data_pre_processed_chunks
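This last hunk in isolation: a cleaned sample that still exceeds max_size words is now re-split through split_by_chuncks instead of being appended oversized, and the no-sentences fallback delegates to the same helper rather than duplicating the windowing loop. The new guard, sketched standalone (chunking inlined for brevity):

    max_size = 4
    clean_data = "a b c d e f g h i"  # pretend output of text_cleaning
    words = clean_data.split()

    if len(words) > max_size:  # still too long after cleaning: re-split
        chunks = [" ".join(words[i:i + max_size]) for i in range(0, len(words), max_size)]
    else:
        chunks = [clean_data]

    print(chunks)  # ['a b c d', 'e f g h', 'i']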
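Taken together: pre_process still returns the cleaned whole text plus its chunks, but every chunking path now funnels through split_by_chuncks. A hedged usage sketch, assuming the unchanged top of the function builds sentences (via a sentence tokenizer) and data_pre_processed (the cleaned full text):

    from text_cleaning_transforerms import pre_process

    text = open("sample.txt").read()  # hypothetical input file
    whole, chunks = pre_process(text, min_lenght=2, max_size=64)
    print(len(chunks), "chunks; first:", chunks[0][:80])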