import csv

import numpy as np

# Global vocabulary: maps each token seen so far to a unique integer id.
# Populated by get_word_embeddings() during training and read back by
# get_sequence() at prediction time.
words_to_nums = {}


def get_data_for_training(filename):
    """Read a comma-delimited CSV file and return every row as a list of lists.

    Uses a context manager so the file handle is always closed (the original
    left it open), and passes newline='' as the csv module requires for
    correct handling of quoted fields.
    """
    with open(filename, 'rt', newline='') as raw_data:
        return list(csv.reader(raw_data, delimiter=','))


def get_data_and_labels(raw_data):
    """Split raw CSV rows into tokenized sentences and binary labels.

    Assumes a header row, with row ids in column 0, a 'positive'/'negative'
    sentiment label in column 1, and the review text in column 2 — TODO
    confirm against the actual CSV schema.

    NOTE: mutates raw_data in place (drops the header row and the first two
    fields of every remaining row).

    Returns:
        (sentences, labels) where sentences is a list of token lists and
        labels is an int ndarray with 1 for 'positive' and 0 otherwise.
    """
    import nltk  # deferred so the module imports even without nltk installed

    # Column 1, below the header row, holds the sentiment label.  Comparing
    # directly against 'positive' yields the 1/0 labels in one step instead
    # of stringly assigning ints into a str-dtype array and re-casting.
    labels = (np.array(raw_data)[1:, 1] == 'positive').astype('int')

    # Strip the header row and the id/label columns, leaving only the text.
    del raw_data[0]
    for row in raw_data:
        del row[0]
        del row[0]

    # Replace each remaining single-field row with its token list.
    for i, row in enumerate(raw_data):
        raw_data[i] = nltk.word_tokenize(row[0])
    return raw_data, labels


def get_word_embeddings(sentences):
    """Convert tokenized sentences into lists of integer word ids.

    Unseen words are assigned the next free id and recorded in the global
    words_to_nums vocabulary; known words reuse their existing id.  The next
    free id is len(words_to_nums), which fixes a bug in the original where a
    local counter restarted at 0 on every call and reassigned (clobbered)
    ids already held by earlier words.

    Returns:
        An object-dtype ndarray of id lists (rows may differ in length).
    """
    data = []
    for words in sentences:
        nums = []
        for word in words:
            if word not in words_to_nums:
                # First sighting: assign the next free id.
                words_to_nums[word] = len(words_to_nums)
            nums.append(words_to_nums[word])
        data.append(nums)
    return np.array(data, dtype=object)


def vectorize_sequence(sequences, dimensions):
    """Multi-hot encode id sequences into a (len(sequences), dimensions) array.

    results[i, j] is 1.0 iff word id j occurs in sequences[i]; ids >=
    dimensions would raise IndexError, so dimensions must cover the
    vocabulary size.
    """
    results = np.zeros((len(sequences), dimensions))
    for i, sequence in enumerate(sequences):
        # NumPy fancy indexing sets every listed column of row i at once.
        results[i, sequence] = 1.
    return results


def get_sequence(text):
    """Tokenize free text and map it to known vocabulary ids.

    Out-of-vocabulary tokens are silently skipped.  Returns a 2-D array of
    shape (1, n_known_tokens), ready to feed to vectorize_sequence.
    """
    import nltk  # deferred so the module imports even without nltk installed

    sequence = [words_to_nums[word]
                for word in nltk.word_tokenize(text)
                if word in words_to_nums]
    return np.array([sequence])