File size: 1,710 Bytes
5502197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import csv
import numpy as np
import nltk

# Module-level vocabulary mapping each token (str) to its integer id.
# get_word_embeddings() adds entries to it; get_sequence() only reads it.
words_to_nums = {}

def get_data_for_training(filename):
    """Read a comma-separated file and return all of its rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV record, each entry being the list of
        that record's field strings. No row is skipped, so a header row
        (if the file has one) is the first entry.
    """
    # newline='' hands line-ending handling to the csv module, which is what
    # its documentation asks for; the with-block closes the file even when
    # parsing raises, whereas the handle used to be left open.
    with open(filename, 'rt', newline='') as raw_data:
        return list(csv.reader(raw_data, delimiter=','))


def get_data_and_labels(raw_data):
    """Split parsed CSV rows into tokenized texts and binary labels.

    The first row is treated as a header and skipped. In every remaining
    row the second column is the label and the third column is the text;
    the first column is ignored.

    Args:
        raw_data: List of rows, each a list of field strings (the shape
            returned by get_data_for_training). It is left unmodified, so
            the same rows can safely be processed again.

    Returns:
        A tuple ``(sentences, labels)``. ``sentences`` is a list holding the
        ``nltk.word_tokenize`` result for each row's text. ``labels`` is an
        integer NumPy array with 1 where the label is exactly 'positive'
        and 0 for anything else.

    Raises:
        IndexError: If a data row has fewer than three columns.
    """
    # Slicing copies the list, so the caller's rows are never deleted from.
    rows = raw_data[1:]
    # Reading the label column directly avoids building a 2-D string array
    # of the whole table, which fails when rows have differing lengths.
    labels = np.array(
        [1 if row[1] == 'positive' else 0 for row in rows], dtype='int'
    )
    sentences = [nltk.word_tokenize(row[2]) for row in rows]
    return sentences, labels


def get_word_embeddings(sentences):
    """Encode tokenized sentences as lists of integer word ids.

    Each token that is not yet in the module-level ``words_to_nums``
    vocabulary is given the next unused id and recorded there; tokens that
    are already known reuse their existing id.

    Args:
        sentences: Iterable of token sequences (one sequence per sentence).

    Returns:
        The per-sentence id lists wrapped with ``np.array(..., dtype=object)``.
        NumPy yields a 1-D array of lists when sentence lengths differ and a
        2-D object array when every sentence has the same length.
    """
    data = []
    for words in sentences:
        num = []
        for word in words:
            # Ids are handed out contiguously from 0, so the vocabulary size
            # is always the next free id. Deriving it from the dict (rather
            # than a counter restarting at 0) keeps ids unique when this
            # function is called more than once.
            num.append(words_to_nums.setdefault(word, len(words_to_nums)))
        data.append(num)

    return np.array(data, dtype=object)


def vectorize_sequence(sequences, dimensions):
    """Multi-hot encode index sequences into a dense float matrix.

    Args:
        sequences: Sized iterable of index collections, one per output row.
        dimensions: Number of columns in the output matrix.

    Returns:
        A ``(len(sequences), dimensions)`` array of zeros in which row ``r``
        holds 1.0 at every column index listed in ``sequences[r]``.
    """
    n_rows = len(sequences)
    encoded = np.zeros((n_rows, dimensions))
    row = 0
    for indices in sequences:
        # Fancy indexing switches on all listed columns of this row at once;
        # repeated indices simply set the same cell again.
        encoded[row, indices] = 1.0
        row += 1
    return encoded

def get_sequence(text):
    """Convert raw text into a one-row array of known word ids.

    The text is tokenized with ``nltk.word_tokenize``; tokens missing from
    the module-level ``words_to_nums`` vocabulary are dropped, the rest are
    replaced by their ids in order of appearance.

    Args:
        text: The text to encode.

    Returns:
        ``np.array`` built from a single-element list containing the id
        list, i.e. a batch holding exactly one encoded sequence.
    """
    tokens = nltk.word_tokenize(text)
    known_ids = [words_to_nums[tok] for tok in tokens if tok in words_to_nums]
    # Wrap in an outer list so the result has a leading batch axis of one.
    return np.array([known_ids])