import re


def drop_non_relevant_text(text_list):
    '''Based on the distribution of word lengths in the Spanish text, drop
    paragraphs composed mainly (over 50%) of overly long words (more than
    14 characters). This removes artifacts produced while reading the PDFs.
    '''
    # Tokenize each paragraph into words.
    tokenized = [x.split(' ') for x in text_list]
    relevant_sentences = []
    for words in tokenized:
        # Count the words that are suspiciously long.
        too_long = sum(1 for word in words if len(word) > 14)
        # Keep the paragraph only if fewer than half of its words are too long.
        if too_long / len(words) < 0.5:
            relevant_sentences.append(words)
    return [' '.join(x).strip() for x in relevant_sentences]


def preprocess_text(text):
    '''Normalize whitespace, remove hyphens and newlines, and split into sentences.'''
    text = text.strip()
    text = re.sub(' +', ' ', text)   # collapse repeated spaces
    text = re.sub('-', '', text)     # drop hyphens (e.g. from line-break hyphenation)
    text = re.sub('\n', '', text)    # drop newlines
    return [x for x in text.split('.') if len(x) > 1]
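

# A minimal usage sketch of the two helpers above. The sample paragraphs are
# invented for illustration only; in practice the input would be the list of
# paragraphs produced by the PDF-extraction step.
if __name__ == '__main__':
    raw_paragraphs = [
        'El gato se sentó en la alfombra. Luego se durmió.',
        'xxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyy texto',  # mostly garbled long tokens
    ]
    # The second paragraph is dropped: 2 of its 3 words exceed 14 characters.
    clean_paragraphs = drop_non_relevant_text(raw_paragraphs)
    for paragraph in clean_paragraphs:
        # Each paragraph is split into sentences on '.' after normalization.
        print(preprocess_text(paragraph))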