import re
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv("./labeled_data.csv")
print("Finished loading data from labeled_data.csv")
# Data cleansing: strip mention prefixes, URLs, non-letter characters, and
# the retweet marker from the raw tweets (column 6 of labeled_data.csv).
tweets = df.iloc[:, 6]
texts = []
for _, text in tweets.items():
    # Mention prefix such as "@user:"; non-greedy, so a later ":" inside a
    # URL is not swallowed along with the rest of the tweet.
    text = re.sub(r'@.*?:', "", text)
    # URLs, with or without the leading protocol.
    text = re.sub(r'(https?)?://[\w./?=&%]*\b', "", text)
    # Anything that is not a letter or a space.
    text = re.sub(r'[^A-Za-z ]+', "", text)
    # Retweet marker (note: this also matches "RT" inside longer words).
    text = re.sub(r'RT', "", text)
    texts.append(text)
df_1 = df.iloc[:, :6]
df_2 = pd.DataFrame(texts)
print(df_2)
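# A quick, self-contained illustration of what the regex pipeline above does;
# the sample tweet is made up for demonstration, not taken from the dataset:
sample = "RT @someone: so true!! https://t.co/abc #mood"
sample = re.sub(r'@.*?:', "", sample)
sample = re.sub(r'(https?)?://[\w./?=&%]*\b', "", sample)
sample = re.sub(r'[^A-Za-z ]+', "", sample)
sample = re.sub(r'RT', "", sample)
print(repr(sample))  # '  so true  mood' -- only letters and spaces survive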
# Vectorize the cleaned text with bag-of-words counts over 1- to 5-grams,
# dropping English stop words.
count = CountVectorizer(stop_words='english', ngram_range=(1, 5))
X_train_vectorizer = count.fit_transform(df_2[0])
df_counts = pd.DataFrame(X_train_vectorizer.toarray())
# Keep the cleaned text at column position 6 (right after the six metadata
# columns) and append the count features after it; the vocabulary step below
# relies on column 6 being text it can split into words.
df_cleaned = pd.concat([df_1, df_2, df_counts], axis=1)
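# Note: .toarray() above materializes a dense matrix. With ngram_range=(1, 5)
# the n-gram vocabulary grows very quickly, so on a larger corpus this can
# exhaust memory. A sketch of a sparse alternative (scipy ships alongside
# scikit-learn, so no extra dependency is assumed; df_1 is assumed numeric):
# from scipy import sparse
# meta = sparse.csr_matrix(df_1.to_numpy())
# X_all = sparse.hstack([meta, X_train_vectorizer]).tocsr()  # still sparse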
# Data splitting: shuffle the rows, then carve out 60% train, 20% validation,
# and 20% test by default.
def train_validate_test_split(df_local, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(len(df_local))  # positional shuffle, safe for any index
    m = len(df_local)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df_local.iloc[perm[:train_end]]
    validate = df_local.iloc[perm[train_end:validate_end]]
    test = df_local.iloc[perm[validate_end:]]
    return train, validate, test

train, validate, test = train_validate_test_split(df_cleaned)
# Drop rows with missing values and renumber each split from zero.
train = train.dropna(axis=0).reset_index(drop=True)
validate = validate.dropna(axis=0).reset_index(drop=True)
test = test.dropna(axis=0).reset_index(drop=True)
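# Sanity check on the split sizes, which should be roughly 60/20/20; pass a
# seed (e.g. train_validate_test_split(df_cleaned, seed=42)) for a
# reproducible shuffle.
print(len(train), len(validate), len(test))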
# Construct a dictionary
# 1. Traverse each word in the training tweets and store it in a set;
#    the resulting dictionary will be used for one-hot encoding.
# 2. Calculate the maximum number of words that a sentence contains.
train_tweets = train.iloc[:, 6]
word_set = set()
max_len = 0
for _, line in train_tweets.items():
    words = line.split()
    word_set.update(words)
    max_len = max(max_len, len(words))  # updated after every tweet, so the last one counts too
dictionary = list(word_set)
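# The dictionary above is intended for one-hot encoding. A minimal sketch of
# how it could be used; encode_sentence is an assumed helper, not an
# established part of this pipeline:
word_to_idx = {w: i for i, w in enumerate(dictionary)}

def encode_sentence(sentence):
    """One-hot encode a sentence as a (max_len, vocabulary size) matrix."""
    vec = np.zeros((max_len, len(dictionary)), dtype=np.float32)
    for pos, word in enumerate(sentence.split()[:max_len]):
        idx = word_to_idx.get(word)
        if idx is not None:  # silently skip out-of-vocabulary words
            vec[pos, idx] = 1.0
    return vec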
# max_len: 33
# len(dictionary):
# # Load the word2vec model (requires a previously saved "word2vec.model";
# # see the training sketch below)
# model = Word2Vec.load("word2vec.model")
#
# # Convert the text to a list of words; nltk.word_tokenize needs the "punkt"
# # tokenizer data, installed once via nltk.download('punkt')
# words = nltk.word_tokenize(text)
#
# # Convert the words to word vectors using the word2vec model
# # (model.wv[word] raises KeyError for out-of-vocabulary words)
# vectors = [model.wv[word] for word in words]
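# # Nothing in this script creates "word2vec.model". A minimal sketch of how
# # it could be trained on the cleaned training tweets and saved with gensim
# # 4.x (the hyperparameters are illustrative assumptions, not tuned values):
# sentences = [t.split() for t in train_tweets]
# w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
# w2v.save("word2vec.model")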