import re

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv("./labeled_data.csv")
print("Finished loading data from labeled_data.csv")

# Data cleansing
tweets = df.iloc[:, 6]  # column 6 holds the raw tweet text
texts = []
for _, text in tweets.items():
    text = re.sub(r'@.*:', "", text)  # strip "@user:" retweet prefixes
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', "", text,
                  flags=re.MULTILINE)  # strip URLs
    text = re.sub(r'[^A-Za-z ]+', "", text)  # keep letters and spaces only
    text = re.sub(r'RT', "", text)  # drop retweet markers
    texts.append(text)

df_1 = df.iloc[:, :6]
df_text = pd.DataFrame({"text": texts})
print(df_text)

count = CountVectorizer(stop_words='english', ngram_range=(1, 5))
X_train_vectorizer = count.fit_transform(df_text["text"])
df_features = pd.DataFrame(X_train_vectorizer.toarray())
# Keep the cleaned text at column position 6 (as in the raw data) so the
# dictionary-construction step below can still read it, then append the
# n-gram count features.
df_cleaned = pd.concat([df_1, df_text, df_features], axis=1)

# Data splitting
def train_validate_test_split(df_local, train_percent=.6, validate_percent=.2,
                              seed=None):
    np.random.seed(seed)
    # Permute positions (not index labels) so .iloc indexing is always valid.
    m = len(df_local)
    perm = np.random.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df_local.iloc[perm[:train_end]]
    validate = df_local.iloc[perm[train_end:validate_end]]
    test = df_local.iloc[perm[validate_end:]]
    return train, validate, test

train, validate, test = train_validate_test_split(df_cleaned)
train = train.dropna(axis=0).reset_index(drop=True)
validate = validate.dropna(axis=0).reset_index(drop=True)
test = test.dropna(axis=0).reset_index(drop=True)

# Construct a dictionary
# 1. Traverse each word in the dataset and store it in a dictionary;
#    the dictionary will be used for one-hot encoding.
# 2. Calculate the maximum number of words that a sentence contains.
train_tweets = train.iloc[:, 6]
word_set = set()
max_len = 0
for _, line in train_tweets.items():
    words = line.split()
    word_set.update(words)
    max_len = max(max_len, len(words))  # also counts the final tweet
dictionary = list(word_set)
# max_len: 33
# len(dictionary):

# # Load the word2vec model
# model = Word2Vec.load("word2vec.model")
#
# # Convert the text to a list of words
# words = nltk.word_tokenize(text)
#
# # Convert the words to word vectors using the word2vec model
# vectors = [model.wv[word] for word in words]
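# # The commented-out block above loads "word2vec.model", but nothing in this
# # script creates that file. A minimal sketch of how it could be trained from
# # the cleaned tweets with gensim's 4.x API is below; vector_size, window,
# # and min_count are illustrative assumptions, not values from this project.
# sentences = [t.split() for t in texts]  # whitespace-tokenized cleaned tweets
# model = Word2Vec(sentences=sentences, vector_size=100, window=5,
#                  min_count=1, workers=4)
# model.save("word2vec.model")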
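# The dictionary-construction step earlier notes that `dictionary` will be
# used for one-hot encoding and `max_len` for a fixed sentence length, but
# that encoding step is not implemented here. A minimal sketch of one way it
# could work follows; the word->index mapping, the use of 0 as the
# pad/unknown id, and the name `encode` are assumptions for illustration,
# not part of the original pipeline.
word_to_idx = {w: i + 1 for i, w in enumerate(dictionary)}  # 0 = pad/unknown

def encode(tweet, length=max_len):
    # Map each word to its index, truncate, then right-pad to a fixed length.
    ids = [word_to_idx.get(w, 0) for w in tweet.split()][:length]
    return ids + [0] * (length - len(ids))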