import pickle

import contractions
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from keras_preprocessing.sequence import pad_sequences

# NLTK resources needed for tokenization, lemmatization, and POS tagging.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Install the spaCy model first: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))


def text_transform(string_text):
    """Convert a raw string into a padded integer sequence for the model."""
    # Load the tokenizer that was fitted during training.
    with open('model/tokenizer.pickle', 'rb') as handle:
        loaded_tokenizer = pickle.load(handle)
    sequences = loaded_tokenizer.texts_to_sequences([string_text])
    padded_sequences = pad_sequences(sequences, maxlen=50,
                                     padding='post', truncating='post')
    return padded_sequences


# POS tags to keep (verbs, adjectives, adverbs, and related tags): the
# emotion of a sentence depends mainly on these parts of speech.
MAIN_POS_TAGS = {'JJR', 'VB', 'WP', 'WRB', 'NNS', 'JJS',
                 'JJ', 'RB', 'MD', 'VBZ', 'VBG', 'VBP'}


def get_main_words(string_text):
    """Keep only the words whose POS tag is in MAIN_POS_TAGS."""
    tokens = nltk.word_tokenize(string_text)
    pos_tags = nltk.pos_tag(tokens)
    string_list = [token for token, tag in pos_tags if tag in MAIN_POS_TAGS]
    if string_list:
        return ' '.join(string_list)
    return None


def pre_processing_data_2(string_text):
    """Complex pre-processing: lemmatize, expand contractions, and keep only
    the main POS words; fall back to stop-word removal if none remain."""
    string_text = string_text.lower()
    string_output = ' '.join(token.lemma_ for token in nlp(string_text))
    string_output = contractions.fix(string_output)
    string_processed = get_main_words(string_output)
    tokenizer = RegexpTokenizer(r'\w+')  # strips punctuation
    if string_processed:
        return ' '.join(tokenizer.tokenize(string_processed))
    words = [w for w in tokenizer.tokenize(string_output)
             if w not in stop_words]
    return ' '.join(words)


def preprocessing_data(string_text):
    """Simple pre-processing: lemmatize, expand contractions, strip
    punctuation, and remove stop words."""
    string_text = string_text.lower()
    string_output = ' '.join(token.lemma_ for token in nlp(string_text))
    string_output = contractions.fix(string_output)
    tokenizer = RegexpTokenizer(r'\w+')
    words = [w for w in tokenizer.tokenize(string_output)
             if w not in stop_words]
    return ' '.join(words)
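

# Example usage (a minimal sketch): the model path 'model/emotion_model.h5'
# and the argmax-over-classes output shape are assumptions, not part of this
# module; substitute the actual trained model and its label ordering. This
# only illustrates how preprocessing_data and text_transform chain together
# at inference time.
if __name__ == "__main__":
    import numpy as np
    from tensorflow.keras.models import load_model

    sample = "I can't believe how wonderful this day is!"
    cleaned = preprocessing_data(sample)   # lemmatized, stop words removed
    features = text_transform(cleaned)     # padded sequence of shape (1, 50)

    model = load_model('model/emotion_model.h5')  # hypothetical model path
    probs = model.predict(features)
    print("predicted class index:", int(np.argmax(probs, axis=-1)[0]))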