# emotion_classification_surreynlp_2023 / pre_processing_data.py
import pickle

import contractions
import nltk
import spacy

from keras_preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
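
# Download the NLTK resources used below: the Punkt tokenizer, WordNet data
# for lemma lookups, the stop-word list, and the perceptron POS tagger.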
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Requires the spaCy English model: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))


def text_transform(string_text):
    """Turn one raw string into a padded sequence of token ids.

    Returns an array of shape (1, 50), zero-padded/truncated on the right.
    """
    # Load the tokenizer that was fitted on the training data.
    with open('model/tokenizer.pickle', 'rb') as handle:
        loaded_tokenizer = pickle.load(handle)
    sequences = loaded_tokenizer.texts_to_sequences([string_text])
    padded_sequences = pad_sequences(sequences, maxlen=50, padding='post', truncating='post')
    return padded_sequences


# Keep verbs, adjectives, adverbs, and related word classes, because the
# emotion of a sentence depends heavily on them.
def get_main_words(string_text):
    tokens = nltk.word_tokenize(string_text)
    pos_tags = nltk.pos_tag(tokens)
    # Penn Treebank tags to keep (verbs, adverbs, adjectives, comparatives, ...).
    kept_tags = {'JJR', 'VB', 'WP', 'WRB', 'NNS', 'JJS', 'JJ', 'RB', 'MD', 'VBZ', 'VBG', 'VBP'}
    string_list = [token for token, tag in pos_tags if tag in kept_tags]
    if string_list:
        return ' '.join(string_list)
    return None
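
# Illustrative only; the exact tags depend on NLTK's perceptron tagger, e.g.:
#   get_main_words("today be a very happy day") -> roughly "be very happy"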


# Complex pre-processing: lemmatize, expand contractions, and keep only the
# emotion-bearing words; fall back to plain stop-word removal when none remain.
def pre_processing_data_2(string_text):
    string_text = string_text.lower()
    # Lemmatize with spaCy, then expand any remaining contractions.
    string_output = ' '.join([token.lemma_ for token in nlp(string_text)])
    string_output = contractions.fix(string_output)
    string_processed = get_main_words(string_output)
    if string_processed:
        # Drop punctuation by keeping word characters only.
        tokenizer = RegexpTokenizer(r'\w+')
        string_processed = tokenizer.tokenize(string_processed)
        return " ".join(string_processed)
    # Fallback: keep every token that is not a stop word.
    tokenizer = RegexpTokenizer(r'\w+')
    string_output = tokenizer.tokenize(string_output)
    string_output = [w for w in string_output if w not in stop_words]
    return " ".join(string_output)


# Simple pre-processing: lowercase, lemmatize, expand contractions, and drop
# stop words.
def preprocessing_data(string_text):
    string_text = string_text.lower()
    string_output = ' '.join([token.lemma_ for token in nlp(string_text)])
    string_output = contractions.fix(string_output)
    tokenizer = RegexpTokenizer(r'\w+')
    string_output = tokenizer.tokenize(string_output)
    string_output = [w for w in string_output if w not in stop_words]
    return " ".join(string_output)
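

# Minimal usage sketch (not part of the original module): assumes
# model/tokenizer.pickle exists and the NLTK/spaCy resources above are installed.
if __name__ == "__main__":
    sample = "I don't think I've ever been happier!"
    cleaned = pre_processing_data_2(sample)
    print("cleaned text:", cleaned)
    # A (1, 50) array of padded token ids, ready for the Keras model.
    print("model input:", text_transform(cleaned))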