import re
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv("./labeled_data.csv")
print("Finished loading data from labeled_data.csv")
# Data cleansing: strip mention prefixes, URLs, non-letter characters, and
# the retweet marker from the raw tweets (column 6 of labeled_data.csv).
tweets = df.iloc[:, 6]
texts = []
for _, text in tweets.items():
    # Mention prefix such as "@user:"; non-greedy, so a later ":" inside a
    # URL is not swallowed along with the rest of the tweet.
    text = re.sub(r'@.*?:', "", text)
    # URLs, with or without the leading protocol.
    text = re.sub(r'(https?)?://[\w./?=&%]*\b', "", text)
    # Anything that is not a letter or a space.
    text = re.sub(r'[^A-Za-z ]+', "", text)
    # Retweet marker (note: this also matches "RT" inside longer words).
    text = re.sub(r'RT', "", text)
    texts.append(text)
df_1 = df.iloc[:, :6]
df_2 = pd.DataFrame(texts)
print(df_2)
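# A quick, self-contained illustration of what the regex pipeline above does;
# the sample tweet is made up for demonstration, not taken from the dataset:
sample = "RT @someone: so true!! https://t.co/abc #mood"
sample = re.sub(r'@.*?:', "", sample)
sample = re.sub(r'(https?)?://[\w./?=&%]*\b', "", sample)
sample = re.sub(r'[^A-Za-z ]+', "", sample)
sample = re.sub(r'RT', "", sample)
print(repr(sample))  # '  so true  mood' -- only letters and spaces survive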
# Vectorize the cleaned text with bag-of-words counts over 1- to 5-grams,
# dropping English stop words.
count = CountVectorizer(stop_words='english', ngram_range=(1, 5))
X_train_vectorizer = count.fit_transform(df_2[0])
df_counts = pd.DataFrame(X_train_vectorizer.toarray())
# Keep the cleaned text at column position 6 (right after the six metadata
# columns) and append the count features after it; the vocabulary step below
# relies on column 6 being text it can split into words.
df_cleaned = pd.concat([df_1, df_2, df_counts], axis=1)
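# Note: .toarray() above materializes a dense matrix. With ngram_range=(1, 5)
# the n-gram vocabulary grows very quickly, so on a larger corpus this can
# exhaust memory. A sketch of a sparse alternative (scipy ships alongside
# scikit-learn, so no extra dependency is assumed; df_1 is assumed numeric):
# from scipy import sparse
# meta = sparse.csr_matrix(df_1.to_numpy())
# X_all = sparse.hstack([meta, X_train_vectorizer]).tocsr()  # still sparse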
# Data splitting: shuffle the rows, then carve out 60% train, 20% validation,
# and 20% test by default.
def train_validate_test_split(df_local, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(len(df_local))  # positional shuffle, safe for any index
    m = len(df_local)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df_local.iloc[perm[:train_end]]
    validate = df_local.iloc[perm[train_end:validate_end]]
    test = df_local.iloc[perm[validate_end:]]
    return train, validate, test

train, validate, test = train_validate_test_split(df_cleaned)
# Drop rows with missing values and renumber each split from zero.
train = train.dropna(axis=0).reset_index(drop=True)
validate = validate.dropna(axis=0).reset_index(drop=True)
test = test.dropna(axis=0).reset_index(drop=True)
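# Sanity check on the split sizes, which should be roughly 60/20/20; pass a
# seed (e.g. train_validate_test_split(df_cleaned, seed=42)) for a
# reproducible shuffle.
print(len(train), len(validate), len(test))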
# Construct a dictionary
# 1. Traverse each word in the training tweets and store it in a set;
#    the resulting dictionary will be used for one-hot encoding.
# 2. Calculate the maximum number of words that a sentence contains.
train_tweets = train.iloc[:, 6]
word_set = set()
max_len = 0
for _, line in train_tweets.items():
    words = line.split()
    word_set.update(words)
    max_len = max(max_len, len(words))  # updated after every tweet, so the last one counts too
dictionary = list(word_set)
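# The dictionary above is intended for one-hot encoding. A minimal sketch of
# how it could be used; encode_sentence is an assumed helper, not an
# established part of this pipeline:
word_to_idx = {w: i for i, w in enumerate(dictionary)}

def encode_sentence(sentence):
    """One-hot encode a sentence as a (max_len, vocabulary size) matrix."""
    vec = np.zeros((max_len, len(dictionary)), dtype=np.float32)
    for pos, word in enumerate(sentence.split()[:max_len]):
        idx = word_to_idx.get(word)
        if idx is not None:  # silently skip out-of-vocabulary words
            vec[pos, idx] = 1.0
    return vec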
# max_len: 33
# len(dictionary):
# # Load the word2vec model (requires a previously saved "word2vec.model";
# # see the training sketch below)
# model = Word2Vec.load("word2vec.model")
#
# # Convert the text to a list of words; nltk.word_tokenize needs the "punkt"
# # tokenizer data, installed once via nltk.download('punkt')
# words = nltk.word_tokenize(text)
#
# # Convert the words to word vectors using the word2vec model
# # (model.wv[word] raises KeyError for out-of-vocabulary words)
# vectors = [model.wv[word] for word in words]
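# # Nothing in this script creates "word2vec.model". A minimal sketch of how
# # it could be trained on the cleaned training tweets and saved with gensim
# # 4.x (the hyperparameters are illustrative assumptions, not tuned values):
# sentences = [t.split() for t in train_tweets]
# w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
# w2v.save("word2vec.model")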