tsantos committed on
Commit
00ddc44
1 Parent(s): d63f0d1

Update text_cleaning_transforerms.py

Files changed (1)
  1. text_cleaning_transforerms.py +28 -15
text_cleaning_transforerms.py CHANGED
@@ -4,10 +4,6 @@ from os import listdir
 from os.path import isfile, join
 import numpy as np
 import re
-import nltk
-nltk.download('punkt')
-nltk.download('stopwords')
-nltk.download('wordnet')
 
 from gensim.parsing import preprocessing
 from gensim.parsing.preprocessing import strip_tags, strip_punctuation
@@ -181,6 +177,19 @@ def text_cleaning(data,min_lenght=2,extra_clean=True, remove_punctuation=False):
 
     return clean_t
 
+def split_by_chuncks(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuation=False):
+    pre_processed_chunks = []
+    words = word_tokenize(data)
+    lower_b, upper_b = 0, max_size
+    for x in range(math.ceil(len(words)/max_size)):
+        sample = " ".join(x for x in words[lower_b:upper_b])
+        lower_b, upper_b = upper_b, upper_b+max_size
+        clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+
+        pre_processed_chunks.append(clean_data)
+
+    return pre_processed_chunks
+
 # set only_data = True if no need to get scores or if dataaset doesn't have a score
 def pre_process(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuation=False):
 
@@ -194,12 +203,18 @@ def pre_process(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuat
     data_pre_processed_chunks,sample = [],""
 
     # Were able to split into sentences
-    if len(sentences)>1:
+    if len(sentences)>2:
        for index,sentence in enumerate(sentences):
            if len(sentence.split()) + len(sample.split()) <= max_size:
                sample += sentence
            else:
-               data_pre_processed_chunks.append(text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation))
+               if len(sample.split())>1:
+                   clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+                   if len(clean_data.split()) > max_size:
+                       pre_processed_chunks = split_by_chuncks(sample,min_lenght=min_lenght,max_size=max_size, extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+                       data_pre_processed_chunks.extend(pre_processed_chunks)
+                   else:
+                       data_pre_processed_chunks.append(clean_data)
                sample = sentence if index < len(sentences)-1 else ""
 
        if len(sample) ==0:
@@ -208,18 +223,16 @@ def pre_process(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuat
        clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
 
        #if len(clean_data.split()) >3:
-       data_pre_processed_chunks.append(clean_data)
+       if len(clean_data.split()) > max_size:
+           pre_processed_chunks = split_by_chuncks(clean_data,min_lenght=min_lenght,max_size=max_size, extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+           data_pre_processed_chunks.extend(pre_processed_chunks)
+       else:
+           data_pre_processed_chunks.append(clean_data)
 
    # Split by get max size chunks
    else:
-       words = word_tokenize(data)
-       lower_b, upper_b = 0, max_size
-       for x in range(math.ceil(len(words)/max_size)):
-           sample = " ".join(x for x in words[lower_b:upper_b])
-           lower_b, upper_b = upper_b, upper_b+max_size
-           clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
-           #if len(clean_data.split()) >3:
-           data_pre_processed_chunks.append(clean_data)
+       pre_processed_chunks = split_by_chuncks(data,min_lenght=min_lenght,max_size=max_size, extra_clean=extra_clean, remove_punctuation=remove_punctuation)
+       data_pre_processed_chunks.extend(pre_processed_chunks)
 
    # return the pre_processed of whoole text and chunks
    return data_pre_processed,data_pre_processed_chunks
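For reference, a minimal usage sketch of the functions touched by this commit. It assumes the module is importable under its file name and that word_tokenize and math are imported elsewhere in the file (they are not part of this hunk); the sample text and the max_size values below are invented for illustration only.

# Minimal usage sketch; function and parameter names are taken from the diff above,
# the sample text is made up.
from text_cleaning_transforerms import pre_process, split_by_chuncks

sample_text = (
    "Transformer models accept a fixed number of tokens. "
    "Long documents are therefore split into smaller chunks before cleaning. "
    "Each chunk is cleaned independently and collected into a list."
)

# pre_process returns the cleaned full text plus a list of chunks of at most
# max_size words: it chunks by sentences when it finds more than two of them,
# otherwise it falls back to split_by_chuncks on the raw text.
full_clean, chunks = pre_process(sample_text, min_lenght=2, max_size=64)

# split_by_chuncks alone windows the tokenized text into max_size-word slices
# and runs text_cleaning on each slice.
windows = split_by_chuncks(sample_text, max_size=8)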