import csv
import numpy as np
import nltk

# Global vocabulary: maps each word seen so far to a unique integer id.
words_to_nums = {}

def get_data_for_training(filename):
    # Read the CSV file and return all rows, header included, as a list.
    with open(filename, 'rt') as raw_data:
        reader = csv.reader(raw_data, delimiter=',')
        return list(reader)

def get_data_and_labels(raw_data):
    # Rows are assumed to look like [id, label, text], with a header row first.
    labels = np.array(raw_data)
    labels = np.delete(labels, 0, axis=0)  # drop the header row
    labels = np.delete(labels, 0, axis=1)  # drop the id column
    labels = labels[:, 0]                  # keep the label column
    labels = (labels == 'positive').astype('int')  # 'positive' -> 1, else 0
    # Strip the header row and the id/label columns so only the text remains,
    # then tokenize each text into a list of words.
    del raw_data[0]
    for row in raw_data:
        del row[0]
        del row[0]
    for i in range(len(raw_data)):
        raw_data[i] = nltk.word_tokenize(raw_data[i][0])
    return raw_data, labels

def get_word_embeddings(sentences):
    # Encode each tokenized sentence as a list of integer word ids, growing
    # the words_to_nums vocabulary whenever a new word appears. (Despite the
    # name, this builds integer encodings, not learned embeddings.)
    counter = len(words_to_nums)  # continue numbering across repeated calls
    data = []
    for words in sentences:
        num = []
        for word in words:
            if word not in words_to_nums:
                words_to_nums[word] = counter
                num.append(counter)
                counter = counter + 1
            else:
                num.append(words_to_nums[word])
        data.append(num)
    # dtype=object because the encoded sentences have different lengths.
    return np.array(data, dtype=object)
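
# Illustrative call on made-up tokens, assuming words_to_nums starts empty:
#   get_word_embeddings([['good', 'film'], ['bad']])
#   -> array([list([0, 1]), list([2])], dtype=object)
#   and words_to_nums becomes {'good': 0, 'film': 1, 'bad': 2}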

def vectorize_sequence(sequences, dimensions):
    # Multi-hot encode: row i gets a 1. in every column whose word id
    # appears in sequences[i].
    results = np.zeros((len(sequences), dimensions))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results
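
# Illustrative call (hypothetical ids): vectorize_sequence([[0, 2]], 4)
# -> array([[1., 0., 1., 0.]])  # columns 0 and 2 switched on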

def get_sequence(text):
    # Tokenize a single piece of text and map it to known word ids,
    # skipping any word that was never seen during training.
    text_input = nltk.word_tokenize(text)
    sequence = []
    for word in text_input:
        if word in words_to_nums:
            sequence.append(words_to_nums[word])
    # Wrap in an outer list so the shape matches vectorize_sequence's input.
    return np.array([sequence])
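
# A minimal end-to-end sketch. The filename 'reviews.csv' and the query text
# are illustrative assumptions, not part of the original code; the CSV is
# assumed to be laid out as [id, label, text] with 'positive'/'negative'
# labels, matching what get_data_and_labels expects.
if __name__ == '__main__':
    # nltk.word_tokenize needs the 'punkt' models; download them once if missing.
    nltk.download('punkt', quiet=True)
    rows = get_data_for_training('reviews.csv')
    sentences, labels = get_data_and_labels(rows)
    encoded = get_word_embeddings(sentences)
    # One row per review, one column per vocabulary word.
    x_train = vectorize_sequence(encoded, len(words_to_nums))
    # Encode unseen text against the same vocabulary before vectorizing it.
    x_query = vectorize_sequence(get_sequence('a positive review'), len(words_to_nums))
    print(x_train.shape, labels.shape, x_query.shape)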