maynaS committed on
Commit
5502197
1 Parent(s): ec43a13

Upload utilities.py

Browse files
Files changed (1) hide show
  1. utilities.py +65 -0
utilities.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import numpy as np
3
+ import nltk
4
+
5
# Module-level vocabulary shared by the functions in this file: maps each word
# (token string) to its integer index. get_word_embeddings() adds entries;
# get_sequence() only reads them.
+ words_to_nums = {}
6
+
7
def get_data_for_training(filename):
    """Read a comma-delimited CSV file and return all of its rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV row, in file order (a header row,
        if the file has one, is included). Each entry is the list of
        that row's fields as strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed even when parsing
    # fails. newline='' is what the csv module asks for so that quoted
    # fields containing line breaks are parsed correctly.
    with open(filename, 'rt', newline='') as raw_data:
        return list(csv.reader(raw_data, delimiter=','))
11
+
12
+
13
def get_data_and_labels(raw_data):
    """Split parsed CSV rows into tokenized sentences and binary labels.

    The first row is treated as a header and dropped. In every remaining
    row the second column is read as the label and the third column as the
    text to tokenize; the first column and any columns after the third are
    ignored.

    Args:
        raw_data: List of rows, each a list of strings (for example the
            result of ``get_data_for_training``). The list is updated in
            place: after the call it holds one token list per data row.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is the same list
        object as ``raw_data``, now containing the ``nltk.word_tokenize``
        output for each data row. ``labels`` is a one-dimensional integer
        NumPy array holding 1 where the label equals ``'positive'`` and 0
        otherwise.

    Raises:
        IndexError: If a data row has fewer than three columns.
    """
    rows = raw_data[1:]  # skip the header row

    # Read the label column directly rather than converting the whole table
    # to a NumPy string array first: that conversion copied every text field
    # and failed when rows had differing lengths.
    labels = np.array(
        [1 if row[1] == 'positive' else 0 for row in rows],
        dtype=int,
    )

    sentences = [nltk.word_tokenize(row[2]) for row in rows]

    # Replace the contents in one step so callers holding a reference to
    # raw_data still see the tokenized rows, and a tokenizer failure above
    # leaves the input untouched instead of half-modified.
    raw_data[:] = sentences
    return raw_data, labels
28
+
29
+
30
def get_word_embeddings(sentences):
    """Encode tokenized sentences as lists of integer word indices.

    Despite the name, no embedding vectors are produced: every word is
    mapped to an integer. Words not yet present in the module-level
    ``words_to_nums`` dictionary are added to it under the next free
    index; words already present reuse their stored index.

    Args:
        sentences: Iterable of token sequences (one sequence of word
            strings per sentence).

    Returns:
        A NumPy array with ``dtype=object`` built from the per-sentence
        index lists.
    """
    data = []
    for words in sentences:
        # The next free index is the current vocabulary size. Deriving it
        # from the dictionary (instead of a counter restarted at 0 on each
        # call) keeps indices unique when this function is called more than
        # once. Assumes the stored indices are the contiguous range
        # 0..len-1, which holds as long as only this function adds entries.
        data.append(
            [words_to_nums.setdefault(word, len(words_to_nums)) for word in words]
        )

    # NOTE(review): when every sentence has the same number of tokens NumPy
    # builds a 2-D object array rather than a 1-D array of lists; kept as in
    # the original so the return shape is unchanged.
    return np.array(data, dtype=object)
46
+
47
+
48
def vectorize_sequence(sequences, dimensions):
    """Turn index sequences into a multi-hot matrix.

    Args:
        sequences: Sized iterable whose items are collections of integer
            column indices.
        dimensions: Number of columns in the output matrix.

    Returns:
        A float NumPy array of shape ``(len(sequences), dimensions)``.
        Row ``r`` holds 1.0 in every column listed in the ``r``-th
        sequence and 0.0 everywhere else.
    """
    row_count = len(sequences)
    multi_hot = np.zeros((row_count, dimensions))
    for row_index, column_indices in zip(range(row_count), sequences):
        # Fancy indexing sets all listed columns of this row at once;
        # repeated indices simply write the same 1.0 again.
        multi_hot[row_index, column_indices] = 1.0
    return multi_hot
53
+
54
def get_sequence(text):
    """Convert raw text into a one-row array of known word indices.

    The text is tokenized with ``nltk.word_tokenize``. Tokens found in the
    module-level ``words_to_nums`` dictionary are replaced by their stored
    index; tokens missing from it are dropped.

    Args:
        text: The string to encode.

    Returns:
        A NumPy array built from a single-element list that wraps the
        index list, i.e. one row containing the indices in token order.
    """
    known_indices = [
        words_to_nums[token]
        for token in nltk.word_tokenize(text)
        if token in words_to_nums
    ]
    # Wrap in an outer list so the result has a leading axis of length one.
    return np.array([known_indices])