Update text_cleaning_transforerms.py
text_cleaning_transforerms.py CHANGED (+28 −15)
@@ -4,10 +4,6 @@ from os import listdir
 from os.path import isfile, join
 import numpy as np
 import re
-import nltk
-nltk.download('punkt')
-nltk.download('stopwords')
-nltk.download('wordnet')
 
 from gensim.parsing import preprocessing
 from gensim.parsing.preprocessing import strip_tags, strip_punctuation
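Note on this hunk: the module still uses nltk tokenizers (word_tokenize in the new helper below, and the sentence split inside pre_process), so dropping the import-time download calls means the punkt/stopwords/wordnet resources must now be fetched once per environment. A minimal one-time setup sketch, not part of this commit, with resource names taken from the deleted lines:

    import nltk

    # fetch the resources the old import-time calls pulled on every run
    for resource in ("punkt", "stopwords", "wordnet"):
        nltk.download(resource, quiet=True)  # skipped if already up to date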
@@ -181,6 +177,19 @@ def text_cleaning(data,min_lenght=2,extra_clean=True, remove_punctuation=False):
 
     return clean_t
 
+def split_by_chuncks(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuation=False):
+    pre_processed_chunks = []
+    words = word_tokenize(data)
+    lower_b, upper_b = 0, max_size
+    for x in range(math.ceil(len(words)/max_size)):
+        sample = " ".join(x for x in words[lower_b:upper_b])
+        lower_b, upper_b = upper_b, upper_b+max_size
+        clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+
+        pre_processed_chunks.append(clean_data)
+
+    return pre_processed_chunks
+
 # set only_data = True if no need to get scores or if dataaset doesn't have a score
 def pre_process(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuation=False):
 
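The new split_by_chuncks helper centralizes the fixed-size windowing that the last hunk below removes from pre_process. A standalone sketch of the same slicing pattern, runnable without the module (plain str.split stands in for word_tokenize):

    import math

    def chunk_words(words, max_size=64):
        # slide a fixed-size window over the token list, mirroring the
        # lower_b/upper_b bookkeeping in split_by_chuncks
        chunks, lower_b, upper_b = [], 0, max_size
        for _ in range(math.ceil(len(words) / max_size)):
            chunks.append(" ".join(words[lower_b:upper_b]))
            lower_b, upper_b = upper_b, upper_b + max_size
        return chunks

    print(chunk_words("a b c d e f g".split(), max_size=3))
    # ['a b c', 'd e f', 'g']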
@@ -194,12 +203,18 @@ def pre_process(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuation=False):
     data_pre_processed_chunks,sample = [],""
 
     # Were able to split into sentences
-    if len(sentences)>
+    if len(sentences)>2:
         for index,sentence in enumerate(sentences):
             if len(sentence.split()) + len(sample.split()) <= max_size:
                 sample += sentence
             else:
-
+                if len(sample.split())>1:
+                    clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+                    if len(clean_data.split()) > max_size:
+                        pre_processed_chunks = split_by_chuncks(sample,min_lenght=min_lenght,max_size=max_size, extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+                        data_pre_processed_chunks.extend(pre_processed_chunks)
+                    else:
+                        data_pre_processed_chunks.append(clean_data)
                 sample = sentence if index < len(sentences)-1 else ""
 
         if len(sample) ==0:
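The sentence branch above greedily packs whole sentences until the next one would push the running sample past max_size words; only then is the sample cleaned and flushed. A self-contained sketch of that packing rule (sentences is assumed to come from a sentence tokenizer; note the commit concatenates with sample += sentence, with no separator added):

    def pack_sentences(sentences, max_size=64):
        chunks, sample = [], ""
        for index, sentence in enumerate(sentences):
            if len(sentence.split()) + len(sample.split()) <= max_size:
                sample += sentence  # same no-separator concatenation as the commit
            else:
                if len(sample.split()) > 1:
                    chunks.append(sample)  # flush (the commit cleans it first)
                sample = sentence if index < len(sentences) - 1 else ""
        if sample:
            chunks.append(sample)
        return chunks

    print(pack_sentences(["one two three. ", "four five six. ", "seven eight."], max_size=5))
    # ['one two three. ', 'four five six. seven eight.']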
@@ -208,18 +223,16 @@ def pre_process(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuation=False):
         clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
 
         #if len(clean_data.split()) >3:
-        data_pre_processed_chunks.append(clean_data)
+        if len(clean_data.split()) > max_size:
+            pre_processed_chunks = split_by_chuncks(clean_data,min_lenght=min_lenght,max_size=max_size, extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+            data_pre_processed_chunks.extend(pre_processed_chunks)
+        else:
+            data_pre_processed_chunks.append(clean_data)
 
     # Split by get max size chunks
     else:
-        words = word_tokenize(data)
-        lower_b, upper_b = 0, max_size
-        for x in range(math.ceil(len(words)/max_size)):
-            sample = " ".join(x for x in words[lower_b:upper_b])
-            lower_b, upper_b = upper_b, upper_b+max_size
-            clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
-            #if len(clean_data.split()) >3:
-            data_pre_processed_chunks.append(clean_data)
+        pre_processed_chunks = split_by_chuncks(data,min_lenght=min_lenght,max_size=max_size, extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+        data_pre_processed_chunks.extend(pre_processed_chunks)
 
     # return the pre_processed of whoole text and chunks
     return data_pre_processed,data_pre_processed_chunks
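This last hunk in isolation: a cleaned sample that still exceeds max_size words is now re-split through split_by_chuncks instead of being appended oversized, and the no-sentences fallback delegates to the same helper rather than duplicating the windowing loop. The new guard, sketched standalone (chunking inlined for brevity):

    max_size = 4
    clean_data = "a b c d e f g h i"  # pretend output of text_cleaning
    words = clean_data.split()

    if len(words) > max_size:  # still too long after cleaning: re-split
        chunks = [" ".join(words[i:i + max_size]) for i in range(0, len(words), max_size)]
    else:
        chunks = [clean_data]

    print(chunks)  # ['a b c d', 'e f g h', 'i']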
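Taken together: pre_process still returns the cleaned whole text plus its chunks, but every chunking path now funnels through split_by_chuncks. A hedged usage sketch, assuming the unchanged top of the function builds sentences (via a sentence tokenizer) and data_pre_processed (the cleaned full text):

    from text_cleaning_transforerms import pre_process

    text = open("sample.txt").read()  # hypothetical input file
    whole, chunks = pre_process(text, min_lenght=2, max_size=64)
    print(len(chunks), "chunks; first:", chunks[0][:80])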