# sentiment/process_data.py
import re

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

# Load the labeled tweet dataset; column 6 holds the raw tweet text
df = pd.read_csv("./labeled_data.csv")
print("Finished loading data from labeled_data.csv")
# Data cleansing: strip retweet handles, URLs, and non-letter characters from the tweets
tweets = df.iloc[:, 6]
texts = []
for _, text in tweets.items():
    text = re.sub(r'\@.*?\:', "", text)   # drop the leading "@user:" handle (non-greedy match)
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', "", text, flags=re.MULTILINE)  # drop URLs
    text = re.sub(r'[^A-Za-z ]+', "", text)   # keep letters and spaces only
    text = re.sub(r'RT', "", text)            # drop the retweet marker
    texts.append(text)

df_1 = df.iloc[:, :6]           # label/metadata columns
df_2 = pd.DataFrame(texts)      # cleaned tweet text in column 0
print(df_2)
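# Illustrative spot check (not in the original script; the tweet text is hypothetical):
# a tweet such as "RT @someuser: nice game http://t.co/abc #win" comes out of the loop
# above as roughly "nice game win", up to leftover whitespace.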
# Bag-of-words features: 1- to 5-grams with English stop words removed
count = CountVectorizer(stop_words='english', ngram_range=(1, 5))
count.fit(df_2[0])
X_train_vectorizer = count.transform(df_2[0])
df_vectorized = pd.DataFrame(X_train_vectorizer.toarray())  # n-gram count matrix
# Keep the cleaned text (not the count matrix) alongside the label columns, so the
# dictionary-building step below can still split each tweet into words.
df_cleaned = pd.concat([df_1, df_2], axis=1)
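# Illustrative check (not in the original script): the fitted n-gram vocabulary and
# the shape of the resulting document-term matrix can be inspected like this.
# print(len(count.vocabulary_))      # number of distinct 1- to 5-grams kept
# print(X_train_vectorizer.shape)    # (number_of_tweets, number_of_features)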
# Data splitting: shuffle and split into 60% train, 20% validate, 20% test
def train_validate_test_split(df_local, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df_local.index)   # df_local has a default RangeIndex
    m = len(df_local.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df_local.iloc[perm[:train_end]]
    validate = df_local.iloc[perm[train_end:validate_end]]
    test = df_local.iloc[perm[validate_end:]]
    return train, validate, test
train, validate, test = train_validate_test_split(df_cleaned)
train = train.dropna(axis=0).reset_index(drop=True)
validate = validate.dropna(axis=0).reset_index(drop=True)
test = test.dropna(axis=0).reset_index(drop=True)
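# Illustrative alternative (not used by this script): the same 60/20/20 split could be
# obtained with scikit-learn's train_test_split applied twice; the random_state is assumed.
# from sklearn.model_selection import train_test_split
# rest, test = train_test_split(df_cleaned, test_size=0.2, random_state=42)
# train, validate = train_test_split(rest, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2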
# Construct a dictionary
# 1. Traverse each word in the training tweets and store it in a set;
#    the resulting dictionary will be used for one-hot encoding.
# 2. Track the maximum number of words that a single tweet contains.
train_tweets = train.iloc[:, 6]
word_set = set()
max_len = 0
for _, line in train_tweets.items():
    words = line.split()
    word_set.update(words)
    max_len = max(max_len, len(words))   # compare every tweet, including the last one
dictionary = list(word_set)
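# Illustrative sketch (not part of the original pipeline): how dictionary and max_len
# could be used to turn a tweet into a fixed-length sequence of word indices for
# one-hot encoding; the helper name and the -1 padding value are assumptions.
# word_to_idx = {w: i for i, w in enumerate(dictionary)}
# def encode_tweet(tweet, pad_to=max_len):
#     ids = [word_to_idx[w] for w in tweet.split() if w in word_to_idx]
#     return ids[:pad_to] + [-1] * max(0, pad_to - len(ids))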
# Observed on a previous run: max_len = 33 (len(dictionary) was not recorded)
# # Load the word2vec model
# model = Word2Vec.load("word2vec.model")
#
# # Convert the text to a list of words
# words = nltk.word_tokenize(text)
#
# # Convert the words to word vectors using the word2vec model
# vectors = [model.wv[word] for word in words]
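# Illustrative sketch (assumption, not confirmed by this script): "word2vec.model"
# could be produced by training on the cleaned tweets with the gensim 4.x API; note
# that nltk.word_tokenize above would also require nltk.download('punkt') to have run.
# sentences = [t.split() for t in texts]
# w2v = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)
# w2v.save("word2vec.model")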