chap0lin committed
Commit f91b404
Parent: 7552917

Update app.py

Files changed (1)
  1. app.py +82 -82
app.py CHANGED
@@ -26,116 +26,116 @@ import en_core_web_sm
 nlp = en_core_web_sm.load()
 
 
-# def recall_m(y_true, y_pred):
-#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
-#     possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
-#     recall = true_positives / (possible_positives + K.epsilon())
-#     return recall
+def recall_m(y_true, y_pred):
+    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
+    recall = true_positives / (possible_positives + K.epsilon())
+    return recall
 
-# def precision_m(y_true, y_pred):
-#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
-#     predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
-#     precision = true_positives / (predicted_positives + K.epsilon())
-#     return precision
+def precision_m(y_true, y_pred):
+    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    precision = true_positives / (predicted_positives + K.epsilon())
+    return precision
 
-# def f1_m(y_true, y_pred):
-#     precision = precision_m(y_true, y_pred)
-#     recall = recall_m(y_true, y_pred)
-#     return 2*((precision*recall)/(precision+recall+K.epsilon()))
+def f1_m(y_true, y_pred):
+    precision = precision_m(y_true, y_pred)
+    recall = recall_m(y_true, y_pred)
+    return 2*((precision*recall)/(precision+recall+K.epsilon()))
 
 
-# #initialise callback class
-# class callback(CallbackAny2Vec):
-#     """
-#     Print the loss value after each epoch
-#     """
-#     def __init__(self):
-#         self.epoch = 0
-#         #gensim loss is cumulative, so we record previous values to print
-#         self.loss_previous_step = 0
+#initialise callback class
+class callback(CallbackAny2Vec):
+    """
+    Print the loss value after each epoch
+    """
+    def __init__(self):
+        self.epoch = 0
+        #gensim loss is cumulative, so we record previous values to print
+        self.loss_previous_step = 0
 
-#     def on_epoch_end(self, model):
-#         loss = model.get_latest_training_loss()
-#         if self.epoch % 100 == 0:
-#             print('Loss after epoch {}: {}'.format(self.epoch, loss-self.loss_previous_step))
+    def on_epoch_end(self, model):
+        loss = model.get_latest_training_loss()
+        if self.epoch % 100 == 0:
+            print('Loss after epoch {}: {}'.format(self.epoch, loss-self.loss_previous_step))
 
-#         self.epoch+= 1
-#         self.loss_previous_step = loss
+        self.epoch+= 1
+        self.loss_previous_step = loss
 
 
 
 
 
-# def spacy_lemmatize_text(text):
-#     text = nlp(text)
-#     text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
-#     return text
+def spacy_lemmatize_text(text):
+    text = nlp(text)
+    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
+    return text
 
-# def remove_accented_chars(text):
-#     text = unicodedata.normalize('NFC', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
-#     return text
+def remove_accented_chars(text):
+    text = unicodedata.normalize('NFC', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
+    return text
 
-# def remove_special_characters(text, remove_digits=False):
-#     pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
-#     text = re.sub(pattern, '', text)
-#     return text
+def remove_special_characters(text, remove_digits=False):
+    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
+    text = re.sub(pattern, '', text)
+    return text
 
-# def remove_stopwords(text, is_lower_case=False, stopwords=None):
-#     if not stopwords:
-#         stopwords = nltk.corpus.stopwords.words('english')
-#     tokens = nltk.word_tokenize(text)
-#     tokens = [token.strip() for token in tokens]
+def remove_stopwords(text, is_lower_case=False, stopwords=None):
+    if not stopwords:
+        stopwords = nltk.corpus.stopwords.words('english')
+    tokens = nltk.word_tokenize(text)
+    tokens = [token.strip() for token in tokens]
 
-#     if is_lower_case:
-#         filtered_tokens = [token for token in tokens if token not in stopwords]
-#     else:
-#         filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
+    if is_lower_case:
+        filtered_tokens = [token for token in tokens if token not in stopwords]
+    else:
+        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
 
-#     filtered_text = ' '.join(filtered_tokens)
-#     return filtered_text
+    filtered_text = ' '.join(filtered_tokens)
+    return filtered_text
 
 
-# def pre_process():
-#     opo_texto_sem_caracteres_especiais = (remove_accented_chars(sentence))
-#     sentenceExpanded = contractions.fix(opo_texto_sem_caracteres_especiais)
-#     sentenceWithoutPunctuation = remove_special_characters(sentenceExpanded , remove_digits=True)
-#     sentenceLowered = sentenceWithoutPunctuation.lower()
-#     sentenceLemmatized = spacy_lemmatize_text(sentenceLowered)
-#     sentenceLemStopped = remove_stopwords(sentenceLemmatized, is_lower_case=False)
+def pre_process():
+    opo_texto_sem_caracteres_especiais = (remove_accented_chars(sentence))
+    sentenceExpanded = contractions.fix(opo_texto_sem_caracteres_especiais)
+    sentenceWithoutPunctuation = remove_special_characters(sentenceExpanded , remove_digits=True)
+    sentenceLowered = sentenceWithoutPunctuation.lower()
+    sentenceLemmatized = spacy_lemmatize_text(sentenceLowered)
+    sentenceLemStopped = remove_stopwords(sentenceLemmatized, is_lower_case=False)
 
-#     return nltk.word_tokenize(sentenceLemStopped)
+    return nltk.word_tokenize(sentenceLemStopped)
 
-# def classify(new_column = True):
-#     sentenceWords = json.loads(sentence.replace("'",'"'))
+def classify(new_column = True):
+    sentenceWords = json.loads(sentence.replace("'",'"'))
 
-#     aux_vector = []
-#     for word in sentenceWords:
-#         aux_vector.append(reloaded_w2v_model.wv[word])
-#     w2vWords = []
-#     w2vWords.append(aux_vector)
-#     MCTIinput_vector = pad_sequences(w2vWords, maxlen=2726, padding='pre')
+    aux_vector = []
+    for word in sentenceWords:
+        aux_vector.append(reloaded_w2v_model.wv[word])
+    w2vWords = []
+    w2vWords.append(aux_vector)
+    MCTIinput_vector = pad_sequences(w2vWords, maxlen=2726, padding='pre')
 
-#     value = reconstructed_model_CNN.predict(MCTIinput_vector)[0]
+    value = reconstructed_model_CNN.predict(MCTIinput_vector)[0]
 
-#     if value >= 0.5:
-#         return Image.open(r"elegivel.png")
-#     else:
-#         return Image.open(r"inelegivel.png")
+    # if value >= 0.5:
+    #     return Image.open(r"elegivel.png")
+    # else:
+    #     return Image.open(r"inelegivel.png")
 
-# dataMCTI['opo_pre_tkn'] = sentencesMCTIList_xp8
-# dataMCTI['opo_pre'] = sentencesMCTIList_xp8_sentences
+dataMCTI['opo_pre_tkn'] = sentencesMCTIList_xp8
+dataMCTI['opo_pre'] = sentencesMCTIList_xp8_sentences
 
-# def gen_output(data):
-#     data.to_excel("output.xlsx", index=False)
-#     return "output.xlsx"
+def gen_output(data):
+    data.to_excel("output.xlsx", index=False)
+    return "output.xlsx"
 
 
-# reloaded_w2v_model = Word2Vec.load('word2vec_xp8.model')
+reloaded_w2v_model = Word2Vec.load('word2vec_xp8.model')
 
-# reconstructed_model_CNN = keras.models.load_model("best weights CNN.h5",
-#                                                   custom_objects={'f1_m':f1_m,
-#                                                                   "precision_m":precision_m,
-#                                                                   "recall_m":recall_m})
+reconstructed_model_CNN = keras.models.load_model("best weights CNN.h5",
+                                                  custom_objects={'f1_m':f1_m,
+                                                                  "precision_m":precision_m,
+                                                                  "recall_m":recall_m})
 
 def app(operacao, resultado, dados):
     data = pd.read_excel(dados)