davidlee1102 committed on
Commit
d35f33e
·
1 Parent(s): 7bef4b5

First commit

Browse files
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ model/nlp_surrey_coursework_hunglenhat/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
constance_data.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Emotion label names; a label's position in this list is its class index.
emotion_track_list = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral',
]

# One index per entry of emotion_track_list (0..27).
decode_list = list(range(28))

# Positions in emotion_track_list of the 14 labels kept by the reduced model;
# emotion_model.emotion_predict indexes this list with the model's argmax.
decode_cut_list = [4, 6, 8, 10, 14, 15, 17, 18, 20, 21, 22, 25, 26, 27]

# Compact ids 0..13, one per entry of decode_cut_list.
# NOTE(review): presumably the training-time class ids for the reduced label
# set -- confirm against the training notebook.
decode_cut_transformed_list = list(range(14))
emotion_model.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ import tensorflow_addons as tfa
4
+
5
+ from constance_data import emotion_track_list, decode_cut_list
6
+ from pre_processing_data import preprocessing_data, pre_processing_data_2, text_transform
7
+
8
+
9
_MODEL_PATH = "model/nlp_surrey_coursework_hunglenhat"
_cached_model = None


def _get_model():
    """Return the compiled Keras model, loading it from disk on first use only."""
    global _cached_model
    if _cached_model is None:
        lr = 1e-3
        wd = 1e-4 * lr
        model = tf.keras.models.load_model(_MODEL_PATH)
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd),
                      metrics=['accuracy'])
        _cached_model = model
    return _cached_model


def emotion_predict(sentence: str) -> str:
    """Predict the emotion label for a single sentence.

    The sentence is cleaned with ``pre_processing_data_2``; if that yields
    nothing, the ORIGINAL sentence is cleaned with ``preprocessing_data``
    instead. The cleaned text is tokenised and padded by ``text_transform``
    and passed to the saved model.

    Args:
        sentence: Raw input text.

    Returns:
        The name from ``emotion_track_list`` selected by the model's
        highest-scoring output.

    Raises:
        Exception: Whatever ``model.predict`` raises is propagated. Taking the
            argmax of the un-predicted token-id matrix would return an
            arbitrary label or fail with an unrelated ``IndexError``.
    """
    processed = pre_processing_data_2(sentence)
    if not processed:
        # Fall back on the raw sentence, not on the already-empty result.
        processed = preprocessing_data(sentence)

    features = text_transform(processed)
    scores = _get_model().predict(features)

    # The model's output index is a position in decode_cut_list, which in turn
    # holds the position of the label inside emotion_track_list.
    index_max = int(np.argmax(scores))
    return emotion_track_list[decode_cut_list[index_max]]
model/nlp_surrey_coursework_hunglenhat/fingerprint.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4f4c2e72bf0f471efafd4a4f43d81e20d41f1270ffe840f7dad866e71a223e0
3
+ size 53
model/nlp_surrey_coursework_hunglenhat/keras_metadata.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45149d48f67806c16c962cfa8db094098ac9e49baf9001e1dc105d7cca8a5384
3
+ size 15056
model/nlp_surrey_coursework_hunglenhat/saved_model.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e48ebc2bab3ee1e455e5ca7dbd40c5b9887d425ec000a75b256b54b8ad5d7396
3
+ size 801857
model/nlp_surrey_coursework_hunglenhat/variables/variables.index ADDED
Binary file (2.38 kB). View file
 
model/tokenizer.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a2b3fb1cc1be8d44ba122aa41103dcbee6c6137bd5f4f5f8e7ceecea0d00839
3
+ size 241110
pre_processing_data.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contractions
2
+ import spacy
3
+ import nltk
4
+ import pickle
5
+
6
+ from nltk.corpus import stopwords
7
+ from nltk.tokenize import RegexpTokenizer
8
+ from keras_preprocessing.sequence import pad_sequences
9
+
10
# Download the NLTK resources at import time, in a fixed order.
for _resource in ('punkt', 'wordnet', 'omw-1.4', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

# spaCy English pipeline used for lemmatisation by the pre-processing functions.
nlp = spacy.load("en_core_web_sm")

# English stop words as a set, for fast membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
18
+
19
+
20
def text_transform(string_text):
    """Convert one cleaned text string into a padded integer sequence.

    Loads the pickled tokenizer from ``model/tokenizer.pickle``, encodes the
    text with ``texts_to_sequences`` and pads/truncates the result at the end
    to a length of 50.

    Args:
        string_text: The text to encode.

    Returns:
        The output of ``pad_sequences`` for a one-element batch.
    """
    with open('model/tokenizer.pickle', 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)

    # The tokenizer works on batches, so wrap the single text in a list.
    encoded = tokenizer.texts_to_sequences([string_text])
    return pad_sequences(encoded, maxlen=50, padding='post', truncating='post')
28
+
29
+
30
+ # python -m spacy download en_core_web_sm
31
+ # Pre-process the data by keeping verbs, adjectives and adverbs, because the emotion of a sentence depends on these words.
32
+ import re
33
+
34
+
35
+ # Pre-process the data by keeping verbs, adjectives and adverbs, because the emotion of a sentence depends on these words.
36
# Part-of-speech tags (as returned by nltk.pos_tag) of the tokens to keep.
_MAIN_WORD_TAGS = frozenset({
    'JJR', 'VB', 'WP', 'WRB', 'NNS', 'JJS', 'JJ', 'RB', 'MD', 'VBZ', 'VBG', 'VBP',
})


def get_main_words(string_text):
    """Keep only the tokens whose part-of-speech tag is in the whitelist.

    The text is split with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; tokens whose tag is in ``_MAIN_WORD_TAGS`` are kept in
    their original order.

    Args:
        string_text: The text to filter.

    Returns:
        The kept tokens joined by single spaces, or ``None`` when no token
        matches.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(string_text))
    kept = [token for token, tag in tagged_tokens if tag in _MAIN_WORD_TAGS]
    if not kept:
        return None
    return ' '.join(kept)
49
+
50
+
51
+ # complex pre-processing data
52
def pre_processing_data_2(string_text):
    """Clean a sentence, preferring the words selected by ``get_main_words``.

    The text is lower-cased, lemmatised with the spaCy pipeline ``nlp`` and
    passed through ``contractions.fix``. If ``get_main_words`` returns
    something for that text, its ``\\w+`` tokens are returned joined by
    spaces. Otherwise the ``\\w+`` tokens of the whole text are returned with
    the entries of ``stop_words`` removed.

    Args:
        string_text: Raw input text.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    lemmas = [token.lemma_ for token in nlp(string_text.lower())]
    expanded = contractions.fix(' '.join(lemmas))
    word_tokenizer = RegexpTokenizer(r'\w+')

    main_words = get_main_words(expanded)
    if main_words:
        return " ".join(word_tokenizer.tokenize(main_words))

    # Nothing selected by part of speech: fall back to stop-word removal.
    remaining = [word for word in word_tokenizer.tokenize(expanded) if word not in stop_words]
    return " ".join(remaining)
69
+
70
+
71
def preprocessing_data(string_text):
    """Clean a sentence by lemmatising it and dropping stop words.

    The text is lower-cased, lemmatised with the spaCy pipeline ``nlp``,
    passed through ``contractions.fix``, split into ``\\w+`` tokens and
    stripped of every token found in ``stop_words``.

    Args:
        string_text: Raw input text.

    Returns:
        The remaining tokens joined by single spaces (possibly empty).
    """
    lemmas = [token.lemma_ for token in nlp(string_text.lower())]
    expanded = contractions.fix(' '.join(lemmas))

    words = RegexpTokenizer(r'\w+').tokenize(expanded)
    return " ".join(word for word in words if word not in stop_words)