import re
import unicodedata

import nltk
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below (tokenizer, POS tagger, WordNet, stopwords).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
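# Optional guard (a sketch using nltk.data.find, which raises LookupError when
# a resource is missing) to avoid re-downloading on every import:
#   try:
#       nltk.data.find('tokenizers/punkt')
#   except LookupError:
#       nltk.download('punkt')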
def tokenize(sentence):
    """Collapse runs of whitespace, tokenize, and POS-tag the sentence."""
    sentence = re.sub(r'\s+', ' ', sentence)
    token_words = word_tokenize(sentence)
    token_words = pos_tag(token_words)
    return token_words
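# Illustrative call (tags come from the averaged-perceptron tagger and may
# vary slightly across NLTK versions):
#   tokenize("Cats   are  sleeping")
#   -> [('Cats', 'NNS'), ('are', 'VBP'), ('sleeping', 'VBG')]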
wordnet_lematizer = WordNetLemmatizer()

def stem(token_words):
    """Lemmatize each (word, tag) pair, mapping Penn Treebank tags to WordNet POS.

    Despite its name, this function lemmatizes rather than stems.
    """
    words_lematizer = []
    for word, tag in token_words:
        if tag.startswith('NN'):      # nouns
            word_lematizer = wordnet_lematizer.lemmatize(word, pos='n')
        elif tag.startswith('VB'):    # verbs
            word_lematizer = wordnet_lematizer.lemmatize(word, pos='v')
        elif tag.startswith('JJ'):    # adjectives
            word_lematizer = wordnet_lematizer.lemmatize(word, pos='a')
        elif tag.startswith('R'):     # adverbs (RB, RBR, RBS)
            word_lematizer = wordnet_lematizer.lemmatize(word, pos='r')
        else:                         # fall back to the default (noun) lemma
            word_lematizer = wordnet_lematizer.lemmatize(word)
        words_lematizer.append(word_lematizer)
    return words_lematizer
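# Illustrative pipeline so far (lemmas depend on WordNet coverage):
#   stem(tokenize("The cats were running"))
#   -> ['The', 'cat', 'be', 'run']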
def delete_invalid_word(token_words):
    """Keep only tokens that have at least one WordNet synset."""
    valid_word = []
    for word in token_words:
        if len(wordnet.synsets(word)) > 0:
            valid_word.append(word)
    return valid_word
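# Note that this also drops out-of-vocabulary tokens such as most proper nouns
# and misspellings, e.g. wordnet.synsets('xyzzy') == [], so 'xyzzy' is removed.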
# Customize the English stopword list: add a few domain-specific fillers
# ('additionnaly' is kept as written, presumably matching the corpus) and
# re-allow words that carry meaning for this task.
sr = stopwords.words('english')
sr.append("limited")
sr.append("additionnaly")
sr.append("e.g")
sr.remove("other")
sr.remove("than")
sr.remove("not")
sr.remove("you")
sr.remove("and")

# A separate, unmodified copy of the standard English stopword list.
sr2 = stopwords.words('english')
def delete_stopwords(token_words):
    """Filter tokens against the customized stopword list."""
    cleaned_words = [word for word in token_words if word not in sr]
    return cleaned_words

def delete_stopwords2(token_words):
    """Filter tokens against the unmodified stopword list."""
    cleaned_words = [word for word in token_words if word not in sr2]
    return cleaned_words

def delete_adjwords(token_words):
    """Currently identical to delete_stopwords (filters with the customized list)."""
    cleaned_words = [word for word in token_words if word not in sr]
    return cleaned_words
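# Example of the customization: 'not' was removed from sr above, so it survives
# the customized filter but not the standard one:
#   delete_stopwords(['the', 'cat', 'not'])   -> ['cat', 'not']
#   delete_stopwords2(['the', 'cat', 'not'])  -> ['cat']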
def is_number(s):
    """Return True if s parses as a float or as a Unicode numeric character."""
    try:
        float(s)
        return True
    except ValueError:
        pass
    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False
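# Illustrative checks:
#   is_number('3.14') -> True
#   is_number('四')   -> True   (unicodedata.numeric handles single numeric chars)
#   is_number('abc')  -> False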
# Punctuation/symbol tokens to strip. The title list additionally removes '.'
# and ',', while the proposal list keeps ':' and ';'.
characters_title = [' ', '.', ',', '|', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '-', '...', '^', '{', '}']
characters = [' ', '|', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '-', '...', '^', '{', '}']
characters_proposal = [' ', '|', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '-', '...', '^', '{', '}']
def delete_characters(token_words):
    words_list = [word for word in token_words if word not in characters]
    return words_list

def delete_characters_proposal(token_words):
    words_list = [word for word in token_words if word not in characters_proposal and not is_number(word)]
    return words_list

def delete_characters_title(token_words):
    # Titles are additionally stripped of '.' and ',' via characters_title.
    words_list = [word for word in token_words if word not in characters_title and not is_number(word)]
    return words_list
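# Note: '.' is not in `characters`, so delete_characters keeps sentence-final
# periods (pre_process_stop below relies on this to split on '.'):
#   delete_characters(['cat', ',', '.', 'run']) -> ['cat', '.', 'run']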
def to_lower(token_words):
    words_lists = [x.lower() for x in token_words]
    return words_lists
def pre_process_title(text):
    """Title pipeline: lemmatize, keep WordNet words, strip punctuation/numbers, lowercase."""
    token_words = tokenize(text)
    token_words = stem(token_words)
    token_words = delete_invalid_word(token_words)
    token_words = delete_characters_title(token_words)
    token_words = to_lower(token_words)
    return ' '.join(token_words)

def pre_process(text):
    """Default pipeline using the customized stopword list."""
    token_words = tokenize(text)
    token_words = stem(token_words)
    token_words = delete_stopwords(token_words)
    token_words = delete_characters(token_words)
    token_words = to_lower(token_words)
    return ' '.join(token_words)

def pre_process_type(text):
    """Same as pre_process but with the unmodified stopword list."""
    token_words = tokenize(text)
    token_words = stem(token_words)
    token_words = delete_stopwords2(token_words)
    token_words = delete_characters(token_words)
    token_words = to_lower(token_words)
    return ' '.join(token_words)

def pre_process_proposal(text):
    """Proposal pipeline: keeps ':' and ';' but drops numeric tokens."""
    token_words = tokenize(text)
    token_words = stem(token_words)
    token_words = delete_stopwords(token_words)
    token_words = delete_characters_proposal(token_words)
    token_words = to_lower(token_words)
    return ' '.join(token_words)

def pre_process_list(text):
    """Like pre_process, but returns the token list instead of a joined string."""
    token_words = tokenize(text)
    token_words = stem(token_words)
    token_words = delete_stopwords(token_words)
    token_words = delete_characters(token_words)
    token_words = to_lower(token_words)
    return token_words

def pre_process_stop(text):
    """Keep stopwords, then split the cleaned text into sentence chunks on '.'."""
    token_words = tokenize(text)
    token_words = stem(token_words)
    token_words = delete_characters(token_words)
    token_words = to_lower(token_words)
    text = ' '.join(token_words)
    final_text = text.split(".")
    return final_text
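# Illustrative usage (a sketch; exact outputs depend on the installed NLTK
# models and corpora):
if __name__ == "__main__":
    sample = "Additionally, the cats were not sleeping in 2 boxes."
    print(pre_process(sample))        # e.g. "additionally cat not sleep 2 box ."
    print(pre_process_title(sample))  # WordNet words only, no numbers/punctuation
    print(pre_process_stop(sample))   # stopwords kept, split into chunks on '.'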