# sentiment/process_data.py
import re

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

# Load the labeled tweet dataset; column 6 holds the raw tweet text
df = pd.read_csv("./labeled_data.csv")
print("Finished loading data from labeled_data.csv")
# Data cleansing: strip retweet handles, URLs, and non-letter characters from the tweets
tweets = df.iloc[:, 6]
texts = []
for _, text in tweets.items():
    text = re.sub(r'\@.*?\:', "", text)   # drop the leading "@user:" handle (non-greedy match)
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', "", text, flags=re.MULTILINE)  # drop URLs
    text = re.sub(r'[^A-Za-z ]+', "", text)   # keep letters and spaces only
    text = re.sub(r'RT', "", text)            # drop the retweet marker
    texts.append(text)

df_1 = df.iloc[:, :6]           # label/metadata columns
df_2 = pd.DataFrame(texts)      # cleaned tweet text in column 0
print(df_2)
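# Illustrative spot check (not in the original script; the tweet text is hypothetical):
# a tweet such as "RT @someuser: nice game http://t.co/abc #win" comes out of the loop
# above as roughly "nice game win", up to leftover whitespace.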
# Bag-of-words features: 1- to 5-grams with English stop words removed
count = CountVectorizer(stop_words='english', ngram_range=(1, 5))
count.fit(df_2[0])
X_train_vectorizer = count.transform(df_2[0])
df_vectorized = pd.DataFrame(X_train_vectorizer.toarray())  # n-gram count matrix
# Keep the cleaned text (not the count matrix) alongside the label columns, so the
# dictionary-building step below can still split each tweet into words.
df_cleaned = pd.concat([df_1, df_2], axis=1)
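# Illustrative check (not in the original script): the fitted n-gram vocabulary and
# the shape of the resulting document-term matrix can be inspected like this.
# print(len(count.vocabulary_))      # number of distinct 1- to 5-grams kept
# print(X_train_vectorizer.shape)    # (number_of_tweets, number_of_features)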
# Data splitting: shuffle and split into 60% train, 20% validate, 20% test
def train_validate_test_split(df_local, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df_local.index)   # df_local has a default RangeIndex
    m = len(df_local.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df_local.iloc[perm[:train_end]]
    validate = df_local.iloc[perm[train_end:validate_end]]
    test = df_local.iloc[perm[validate_end:]]
    return train, validate, test
train, validate, test = train_validate_test_split(df_cleaned)
train = train.dropna(axis=0).reset_index(drop=True)
validate = validate.dropna(axis=0).reset_index(drop=True)
test = test.dropna(axis=0).reset_index(drop=True)
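# Illustrative alternative (not used by this script): the same 60/20/20 split could be
# obtained with scikit-learn's train_test_split applied twice; the random_state is assumed.
# from sklearn.model_selection import train_test_split
# rest, test = train_test_split(df_cleaned, test_size=0.2, random_state=42)
# train, validate = train_test_split(rest, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2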
# Construct a dictionary
# 1. Traverse each word in the training tweets and store it in a set;
#    the resulting dictionary will be used for one-hot encoding.
# 2. Track the maximum number of words that a single tweet contains.
train_tweets = train.iloc[:, 6]
word_set = set()
max_len = 0
for _, line in train_tweets.items():
    words = line.split()
    word_set.update(words)
    max_len = max(max_len, len(words))   # compare every tweet, including the last one
dictionary = list(word_set)
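# Illustrative sketch (not part of the original pipeline): how dictionary and max_len
# could be used to turn a tweet into a fixed-length sequence of word indices for
# one-hot encoding; the helper name and the -1 padding value are assumptions.
# word_to_idx = {w: i for i, w in enumerate(dictionary)}
# def encode_tweet(tweet, pad_to=max_len):
#     ids = [word_to_idx[w] for w in tweet.split() if w in word_to_idx]
#     return ids[:pad_to] + [-1] * max(0, pad_to - len(ids))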
# Observed on a previous run: max_len = 33 (len(dictionary) was not recorded)
# # Load the word2vec model
# model = Word2Vec.load("word2vec.model")
#
# # Convert the text to a list of words
# words = nltk.word_tokenize(text)
#
# # Convert the words to word vectors using the word2vec model
# vectors = [model.wv[word] for word in words]
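# Illustrative sketch (assumption, not confirmed by this script): "word2vec.model"
# could be produced by training on the cleaned tweets with the gensim 4.x API; note
# that nltk.word_tokenize above would also require nltk.download('punkt') to have run.
# sentences = [t.split() for t in texts]
# w2v = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)
# w2v.save("word2vec.model")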