|
import re

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

# Load the labeled tweet dataset; column index 6 holds the raw tweet text.
df = pd.read_csv("./labeled_data.csv")
print("Finished loading data from labeled_data.csv")
# Strip Twitter-specific noise from each tweet: "@user:" retweet prefixes,
# URLs, non-alphabetic characters, and the leftover "RT" marker.
tweets = df.iloc[:, 6]
texts = []
for _, text in tweets.items():
    text = re.sub(r'@.*?:', "", text)  # non-greedy, so it stops at the first ':'
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', "", text, flags=re.MULTILINE)
    text = re.sub(r'[^A-Za-z ]+', "", text)
    text = re.sub(r'\bRT\b', "", text)  # word-bounded so words containing "RT" survive
    texts.append(text)
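
# A quick sanity check of the cleaning rules above on a made-up tweet
# (hypothetical example, not a row from the dataset):
sample = "RT @some_user: check this out https://t.co/abc123 #wow!!"
for pattern in (r'@.*?:', r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', r'[^A-Za-z ]+', r'\bRT\b'):
    sample = re.sub(pattern, "", sample)
print(sample)  # mention, URL, punctuation, and the RT marker are all gone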
|
|
|
# Keep the six label/metadata columns and re-wrap the cleaned text.
df_1 = df.iloc[:, :6]
df_2 = pd.DataFrame(texts)
print(df_2)

# Vectorize the cleaned text into n-gram counts (unigrams up to 5-grams,
# English stop words removed).
count = CountVectorizer(stop_words='english', ngram_range=(1, 5))
count.fit(df_2[0])
X_train_vectorizer = count.transform(df_2[0])

# Keep the cleaned text as column position 6 of the combined frame (the
# vocabulary pass below reads it back via .iloc[:, 6]) and append the dense
# n-gram counts after it. Two caveats: .toarray() densifies a very wide
# sparse matrix, and the vectorizer is fit on the full corpus before the
# split below, so vocabulary from validation/test leaks into training.
df_features = pd.DataFrame(X_train_vectorizer.toarray())
df_cleaned = pd.concat([df_1, df_2, df_features], axis=1)
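
# A minimal sketch of keeping the counts sparse instead (an alternative to
# .toarray() above, assuming the downstream model accepts scipy.sparse input):
X_sparse = count.fit_transform(df_2[0])  # fit + transform in one pass, stays sparse
print(X_sparse.shape, X_sparse.nnz)  # (rows, n-gram features), stored non-zeros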
|
|
|
|
|
def train_validate_test_split(df_local, train_percent=.6, validate_percent=.2, seed=None):
    """Shuffle the rows and split them into train/validate/test (60/20/20 by default)."""
    np.random.seed(seed)
    perm = np.random.permutation(len(df_local))  # permute positions, not index labels, so .iloc is safe
    m = len(df_local)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df_local.iloc[perm[:train_end]]
    validate = df_local.iloc[perm[train_end:validate_end]]
    test = df_local.iloc[perm[validate_end:]]
    return train, validate, test
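
# Quick sanity check on a tiny synthetic frame (illustrative names only):
_demo = pd.DataFrame({"x": range(10)})
_tr, _va, _te = train_validate_test_split(_demo, seed=0)
print(len(_tr), len(_va), len(_te))  # 6 2 2 -- the three pieces partition the rows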
|
|
|
train, validate, test = train_validate_test_split(df_cleaned)

# Drop rows with missing values and renumber each split from zero.
train = train.dropna(axis=0).reset_index(drop=True)
validate = validate.dropna(axis=0).reset_index(drop=True)
test = test.dropna(axis=0).reset_index(drop=True)
# Build the training vocabulary and track the longest tweet length in words.
# (The original loop compared lengths only at the top of the next iteration,
# so the final tweet was never counted; this version checks every tweet.)
train_tweets = train.iloc[:, 6]
word_set = set()
max_len = 0
for _, line in train_tweets.items():
    words = line.split()
    word_set.update(words)
    max_len = max(max_len, len(words))

dictionary = sorted(word_set)  # sorted so the word order is reproducible
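
# Word2Vec is imported above but never used in this section; a minimal sketch
# of how the tokenized training tweets could feed it (hyperparameters are
# assumptions, and the vector_size keyword assumes gensim >= 4):
tokenized = [t.split() for t in train_tweets]
w2v = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=1, workers=2)
print(len(w2v.wv))  # number of words the model learned vectors for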
|