import re

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv("./labeled_data.csv")
print("Finished loading data from labeled_data.csv")

# Data cleansing: take the tweet text column (position 6) and strip mentions,
# URLs, non-alphabetic characters and retweet markers.
tweets = df.iloc[:, 6]
texts = []
for _, text in tweets.items():
    text = re.sub(r'\@.*\:', "", text)          # "RT @user:" style prefixes
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', "", text, flags=re.MULTILINE)  # URLs
    text = re.sub(r'[^A-Za-z ]+', "", text)     # keep letters and spaces only
    text = re.sub(r'\bRT\b', "", text)          # leftover standalone retweet markers
    texts.append(text)

# Keep the label columns, the cleaned text, and bag-of-words count features
# (unigrams up to 5-grams, English stop words removed). The cleaned text is kept
# as its own column so the dictionary-building step below can still read raw words.
df_1 = df.iloc[:, :6]
df_text = pd.DataFrame({'clean_tweet': texts})
print(df_text)
count = CountVectorizer(stop_words='english', ngram_range=(1, 5))
X_train_vectorizer = count.fit_transform(df_text['clean_tweet'])
df_counts = pd.DataFrame(X_train_vectorizer.toarray())
df_cleaned = pd.concat([df_1, df_text, df_counts], axis=1)
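
# Illustrative check (not part of the original script): the fitted vectorizer
# exposes its learned vocabulary, which is what the columns of df_counts represent.
print("Number of learned n-grams:", len(count.vocabulary_))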

# Data splitting: shuffle the rows and split them 60% / 20% / 20%
# into train / validation / test sets.
def train_validate_test_split(df_local, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(len(df_local))  # positional shuffle, independent of the index
    m = len(df_local)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df_local.iloc[perm[:train_end]]
    validate = df_local.iloc[perm[train_end:validate_end]]
    test = df_local.iloc[perm[validate_end:]]
    return train, validate, test

train, validate, test = train_validate_test_split(df_cleaned)  # pass seed=... for a reproducible split
train = train.dropna(axis=0).reset_index(drop=True)
validate = validate.dropna(axis=0).reset_index(drop=True)
test = test.dropna(axis=0).reset_index(drop=True)

# Construct a dictionary
# 1. Traverse each word in the training tweets and store it in a set;
#    the resulting dictionary will be used for one-hot encoding.
# 2. Record the maximum number of words that a sentence contains.
train_tweets = train.iloc[:, 6]   # the cleaned tweet text column
word_set = set()

max_len = 0
for _, line in train_tweets.items():
    words = line.split()
    word_set.update(words)
    max_len = max(max_len, len(words))

dictionary = list(word_set)
# max_len: 33
# len(dictionary):
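
# Sketch (assumption, not part of the original pipeline): one possible way to use
# `dictionary` and `max_len` for the one-hot encoding the comment above mentions.
# Each tweet becomes a (max_len, vocabulary size) 0/1 matrix with one row per word.
word_to_index = {word: i for i, word in enumerate(dictionary)}

def one_hot_encode(tweet, word_to_index=word_to_index, max_len=max_len):
    """Return a (max_len, len(word_to_index)) 0/1 matrix for a whitespace-split tweet."""
    encoded = np.zeros((max_len, len(word_to_index)))
    for pos, word in enumerate(tweet.split()[:max_len]):
        if word in word_to_index:        # words outside the training dictionary are skipped
            encoded[pos, word_to_index[word]] = 1
    return encoded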


# # Load the word2vec model
# model = Word2Vec.load("word2vec.model")
#
# # Convert the text to a list of words
# words = nltk.word_tokenize(text)
#
# # Convert the words to word vectors using the word2vec model
# vectors = [model.wv[word] for word in words]
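
# Sketch (assumption, gensim 4.x API): the commented-out block above expects a
# "word2vec.model" file on disk. One way to produce it is to train Word2Vec on
# the cleaned tweets; the parameter values below are illustrative, not from the
# original script.
# tokenized_tweets = [t.split() for t in texts]
# w2v_model = Word2Vec(sentences=tokenized_tweets, vector_size=100, window=5, min_count=1)
# w2v_model.save("word2vec.model")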