import re

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv("./labeled_data.csv")
print("Finished loading data from labeled_data.csv")
# Data cleansing
# Column 6 is assumed to hold the raw tweet text.
tweets = df.iloc[:, 6]
texts = []
for _, text in tweets.items():
    # Strip "@handle ... :" prefixes (note: '.*' is greedy, so everything up
    # to the last ':' in the tweet is removed).
    text = re.sub(r'\@.*\:', "", text)
    # Strip URLs.
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', "", text, flags=re.MULTILINE)
    # Keep letters and spaces only.
    text = re.sub(r'[^A-Za-z ]+', "", text)
    # Drop the retweet marker.
    text = re.sub(r'RT', "", text)
    texts.append(text)
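# Worked example on a made-up tweet (not from the dataset):
#   'RT @someuser: This is GREAT!!!'
#   -> strip handle/colon -> 'RT  This is GREAT!!!'
#   -> strip non-letters  -> 'RT  This is GREAT'
#   -> strip 'RT'         -> '  This is GREAT'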
df_1 = df.iloc[:, :6]
df_2 = pd.DataFrame(texts)  # cleaned tweet text in column 0
print(df_2)

# Bag-of-words features over unigrams up to 5-grams.
count = CountVectorizer(stop_words='english', ngram_range=(1, 5))
count.fit(df_2[0])
X_train_vectorizer = count.transform(df_2[0])
df_vectors = pd.DataFrame(X_train_vectorizer.toarray())
# Keep the cleaned text as column 6 so the dictionary-building step below can
# still read it (it was previously overwritten by the count matrix, which left
# numbers where that step expects strings); the n-gram counts follow it.
df_cleaned = pd.concat([df_1, df_2, df_vectors], axis=1)
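# Optional sanity check (a sketch; get_feature_names_out requires
# scikit-learn >= 1.0):
# print(X_train_vectorizer.shape)            # (n_tweets, n_ngram_features)
# print(count.get_feature_names_out()[:10])  # a few of the learned n-grams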
# Data splitting
def train_validate_test_split(df_local, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df_local.index)
    m = len(df_local.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df_local.iloc[perm[:train_end]]
    validate = df_local.iloc[perm[train_end:validate_end]]
    test = df_local.iloc[perm[validate_end:]]
    return train, validate, test

train, validate, test = train_validate_test_split(df_cleaned)
train = train.dropna(axis=0).reset_index(drop=True)
validate = validate.dropna(axis=0).reset_index(drop=True)
test = test.dropna(axis=0).reset_index(drop=True)
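# Quick check that the split is roughly 60/20/20 (seed is unset, so the
# exact rows differ between runs).
print(len(train), len(validate), len(test))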
# Construct a dictionary
# 1. Traverse each word in the training set and store it in a dictionary;
#    the dictionary will be used for one-hot encoding.
# 2. Calculate the maximum number of words that a sentence contains.
train_tweets = train.iloc[:, 6]
word_set = set()
max_len = 0
for _, line in train_tweets.items():
    words = line.split()
    word_set.update(words)
    # Update after every tweet; the original checked only at the top of the
    # loop, so the last tweet's length was never counted.
    max_len = max(max_len, len(words))
dictionary = list(word_set)
# max_len: 33
# len(dictionary):
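# A minimal sketch of the one-hot encoding the dictionary is meant for,
# assuming each word maps to its index in `dictionary` and tweets are padded
# to `max_len`; `encode_one_hot` is our name, not from the original pipeline.
word_to_idx = {w: i for i, w in enumerate(dictionary)}

def encode_one_hot(sentence):
    # One row per word position, one column per dictionary entry.
    mat = np.zeros((max_len, len(dictionary)), dtype=np.float32)
    for i, word in enumerate(sentence.split()[:max_len]):
        if word in word_to_idx:
            mat[i, word_to_idx[word]] = 1.0
    return mat

# e.g. encode_one_hot(train_tweets.iloc[0]).shape == (max_len, len(dictionary))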
# # Load the word2vec model
# model = Word2Vec.load("word2vec.model")
#
# # Convert the text to a list of words
# words = nltk.word_tokenize(text)
#
# # Convert the words to word vectors using the word2vec model
# vectors = [model.wv[word] for word in words]
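#
# # A minimal sketch (ours, commented out like the block above) for producing
# # the "word2vec.model" file that block expects; hyperparameters are
# # illustrative, not from the original.
# sentences = [t.split() for t in train_tweets]
# w2v = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1)
# w2v.save("word2vec.model")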