Upload utilities.py
Browse files- utilities.py +65 -0
utilities.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import csv
|
2 |
+
import numpy as np
|
3 |
+
import nltk
|
4 |
+
|
5 |
+
# Module-level vocabulary shared by the helpers in this file: maps each token
# (str) to the integer id assigned to it by get_word_embeddings(), and is read
# back by get_sequence(). Nothing in the code visible here clears it, so its
# contents persist across calls.
words_to_nums = {}
|
6 |
+
|
7 |
+
def get_data_for_training(filename):
    """Read a comma-separated file and return all of its rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV record, in file order (a header row,
        if the file has one, is included as the first entry). Each entry is
        a list of the record's fields as strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed once the rows have
    # been materialised. newline='' is what the csv module asks for so that
    # line breaks embedded inside quoted fields are parsed correctly.
    with open(filename, 'rt', newline='') as raw_data:
        return list(csv.reader(raw_data, delimiter=','))
|
11 |
+
|
12 |
+
|
13 |
+
def get_data_and_labels(raw_data):
    """Split parsed CSV rows into tokenized sentences and binary labels.

    The first row is treated as a header and discarded. In every remaining
    row, column 0 is ignored, column 1 is the label and column 2 is the text.

    Args:
        raw_data: List of rows (each a list of strings), e.g. the result of
            get_data_for_training(). It is modified in place: on return it
            contains one token list per data row instead of the CSV rows.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is the same list
        object as ``raw_data``, now holding the ``nltk.word_tokenize`` output
        for each row's text. ``labels`` is a 1-D integer NumPy array with 1
        where the label column equals ``'positive'`` and 0 otherwise.

    Raises:
        IndexError: If a data row has fewer than three columns.
    """
    rows = raw_data[1:]  # skip the header row

    # Read the label column directly instead of copying the whole table
    # (long text column included) into a fixed-width NumPy string array.
    labels = np.array(
        [1 if row[1] == 'positive' else 0 for row in rows]
    ).astype('int')

    # Slice assignment keeps the documented in-place contract: callers that
    # hold a reference to raw_data see the tokenized sentences afterwards.
    raw_data[:] = [nltk.word_tokenize(row[2]) for row in rows]
    return raw_data, labels
|
28 |
+
|
29 |
+
|
30 |
+
def get_word_embeddings(sentences):
    """Encode tokenized sentences as sequences of integer word ids.

    Every word not yet present in the module-level ``words_to_nums`` mapping
    is registered there with a fresh id; known words reuse their stored id.

    Args:
        sentences: Iterable of sentences, each an iterable of word tokens.

    Returns:
        A 1-D NumPy array of dtype ``object`` with one element per sentence;
        each element is the list of integer ids for that sentence's words.

    Side effects:
        Adds previously unseen words to ``words_to_nums``.
    """
    # Continue numbering after the largest id already handed out. Restarting
    # at 0 on every call would give new words ids that collide with words
    # registered by an earlier call, because words_to_nums persists.
    counter = max(words_to_nums.values(), default=-1) + 1

    data = []
    for words in sentences:
        num = []
        for word in words:
            if word not in words_to_nums:
                words_to_nums[word] = counter
                counter += 1
            num.append(words_to_nums[word])
        data.append(num)

    # Fill a pre-allocated object array element by element. np.array(data,
    # dtype=object) would turn equal-length sentences into a 2-D object
    # array, whose rows cannot be used as index arrays later on.
    encoded = np.empty(len(data), dtype=object)
    for position, ids in enumerate(data):
        encoded[position] = ids
    return encoded
|
46 |
+
|
47 |
+
|
48 |
+
def vectorize_sequence(sequences, dimensions):
    """Build a multi-hot matrix from sequences of integer indices.

    Args:
        sequences: Sized collection of index sequences; each one lists the
            column positions to switch on for its row.
        dimensions: Number of columns in the resulting matrix.

    Returns:
        A float NumPy array of shape ``(len(sequences), dimensions)`` that is
        1.0 at ``[row, idx]`` for every ``idx`` in ``sequences[row]`` and 0.0
        everywhere else.
    """
    shape = (len(sequences), dimensions)
    encoded = np.zeros(shape)
    for row, token_ids in enumerate(sequences):
        # Fancy indexing sets all listed columns of this row at once;
        # repeated indices simply set the same cell again.
        encoded[row, token_ids] = 1.0
    return encoded
|
53 |
+
|
54 |
+
def get_sequence(text):
    """Convert raw text into a single-row array of known word ids.

    The text is tokenized with ``nltk.word_tokenize``; tokens that are not
    keys of the module-level ``words_to_nums`` mapping are dropped.

    Args:
        text: The string to encode.

    Returns:
        A NumPy array built from a one-element list holding the id sequence,
        i.e. a single row containing the ids of the recognised tokens in
        their original order.
    """
    tokens = nltk.word_tokenize(text)
    known_ids = [words_to_nums[token] for token in tokens if token in words_to_nums]
    # Wrap the sequence in an outer list so the result is a batch of one.
    return np.array([known_ids])
|