rkrstacic committed
Commit c193173
1 Parent(s): caa4374

Upload app.py

Files changed (1): app.py +431 -0
app.py ADDED
@@ -0,0 +1,431 @@
# -*- coding: utf-8 -*-
"""GradioAppTest.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1QhxoNhhM_kcaoQOyz5hsNWLcf2m2L225
"""

# NOTE: "!" lines are IPython/Colab shell commands; when running this file as a
# plain Python script, install the packages beforehand instead
!pip install gradio
!pip install transformers

import gradio as gr
from transformers import pipeline

"""## JSON"""

# Define the process that the models will be trained for
trainedProcess = "praksa"
trainedProcessJSON = "Praksa"

# NOTE: `json` here is a plain list of process definitions; it shadows the
# stdlib json module, which is not used in this file
json = [
    {
        "name": "Praksa",
        "phases": [
            {
                "name": "Odabir preferencija",
                "alias": ["Prijava prakse", "Odabir zadatka", "Prvi korak"],
                "description": "Odabir preferencija je prvi korak u procesu polaganja prakse. Zahtjeva da student odabere zadatak sa popisa...",
                "duration": "1 mjesec",
            },
            {
                "name": "Ispunjavanje prijavnice",
                "description": "Ispunjavanje prijavnice je drugi korak u procesu polaganja prakse. Student mora ispuniti prijavnicu koja se nalazi na stranici kolegija...",
                "duration": "1 tjedan",
            },
            {
                "name": "Predaja dnevnika prakse",
                "alias": ["Završetak prakse", "Dnevnik"],
                "description": "Predaja dnevnika prakse zadnji je korak u procesu polaganja prakse. S završetkom rada, student predaje dnevnik prakse na stranicu kolegija...",
                "duration": "3 dana",
            },
        ],
        "duration": "2 mjeseca",
    },
    {
        "name": "Izrada završnog rada",
        "phases": [
            {
                "name": "Prijava teme",
                "alias": ["Prvi korak"],
                "description": "Prvi korak u procesu izrade završnog rada je prijava teme. Zahtjeva da student odabere mentora te prijavi temu sa popisa...",
                "duration": "5 dana",
            },
            {
                "name": "Ispuna obrasca",
                "description": "Student ispunjava obrazac sa prijavljenom temom...",
                "duration": "4 dana",
            },
            {
                "name": "Obrana rada",
                "description": "Student brani svoj rad pred komisijom...",
                "duration": "1 sat",
            },
        ],
        "duration": "3 mjeseca",
    },
]

# If tasks do not contain an "alias" property, assign an empty one to them
for process in json:
    for task in process["phases"]:
        if "alias" not in task:
            task["alias"] = []

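# Illustrative sanity check: after the loop above, every phase exposes an
# "alias" list, so downstream code can index it unconditionally
assert all("alias" in task for process in json for task in process["phases"])
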
"""## User intent recognition model

Training time: roughly 6 minutes on CPU, 3 minutes on GPU.
"""

# Training epochs and number of output labels (intent classes)
training_epochs = 10
label_size = 6

# Define dataset URL for training
UIDatasetURL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSPR-FPTMBcYRynP4JdwYQQ8dAhSx1x8i1LPckUcuIUUlrWT82b5Thqb1bBNnPeGJPxxX1CJAlFSd6F/pub?output=xlsx'

# May require a runtime restart on Google Colab
!pip install tensorflow_text
!pip install text-hr

"""### Data loading"""

import tensorflow as tf
import tensorflow_text as tft
import tensorflow_hub as tfh
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Text preprocessor for BERT-based models
preprocessor = tfh.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2')

# Language-agnostic BERT sentence encoder (LaBSE)
model = tfh.KerasLayer('https://tfhub.dev/google/LaBSE/2')

# Read the data and keep only rows belonging to the trained process
data = pd.read_excel(UIDatasetURL)

columns = ['text', 'intent', 'process']
data.columns = columns

data = data[data["process"] == trainedProcess].drop(columns="process")

"""#### Category merging"""

# Convert categories to codes
data['intent'] = data['intent'].astype('category')
data['intent_codes'] = data['intent'].cat.codes

# Display the distribution of codes
values = data['intent'].value_counts()
plt.stem(values)

"""#### Normalize data

### Text preprocessing

1. Remove punctuation
2. Lowercase the text
3. Apply tokenization
4. Apply lemmatizer
5. Remove (Croatian) stopwords
"""

import string
import re
import nltk
import text_hr

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

def remove_punctuation(text):
    return "".join([i for i in text if i not in string.punctuation])

def tokenization(text):
    return re.split(r"\s+", text)

stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    return [i for i in text if i not in stopwords]

porter_stemmer = PorterStemmer()
def stemming(text):
    return [porter_stemmer.stem(word) for word in text]

wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(text):
    return [wordnet_lemmatizer.lemmatize(word) for word in text]

data['text'] = data['text']\
    .apply(lambda x: remove_punctuation(x))\
    .apply(lambda x: x.lower())\
    .apply(lambda x: tokenization(x))\
    .apply(lambda x: lemmatizer(x))

# Build a Croatian stopword list from text_hr word forms
stop_words_list_hr = []
for word_base, l_key, cnt, _suff_id, wform_key, wform in text_hr.get_all_std_words():
    if word_base is not None:
        stop_words_list_hr.append(word_base)
    if wform is not None:
        stop_words_list_hr.append(wform)

# Deduplicate while preserving order
stop_words_list_hr = list(dict.fromkeys(stop_words_list_hr))

def remove_stopwords_hr(text):
    output = [i for i in text if i not in stop_words_list_hr]
    return output

data['text'] = data['text'].apply(lambda x: remove_stopwords_hr(x))

data['text'] = data['text'].str.join(" ")

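# Illustrative end-to-end effect of the preprocessing above, e.g.
# "Koja je prva faza?" -> "koja prva faza" (the exact output depends on the
# text_hr stopword list and the WordNet lemmatizer)
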
"""### Split validation and training data

Train 75%, validation 25% (note: the split is currently disabled below, so the
full set is used for training)
"""

codes = data['intent_codes'].unique()

# Lookup table for translating intent codes back to intent names
CODES_REPR = data[["intent_codes", "intent"]].drop_duplicates().sort_values("intent_codes")


def codeToIntent(prediction) -> str:
    """ Returns the intent of the prediction, not the code """
    return CODES_REPR[CODES_REPR["intent_codes"] == prediction.argmax()].iloc[0]["intent"]

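# Illustrative: CODES_REPR pairs each categorical code with its label, e.g.
#   intent_codes | intent
#   0            | P1
#   1            | P2
#   ...
# codeToIntent then picks the row whose code equals the argmax position of the
# prediction vector
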
preprocessed_validation_data = pd.DataFrame(columns=data.columns)
preprocessed_train_data = pd.DataFrame(columns=data.columns)

for c in codes:
    sample = data[data['intent_codes'] == c]
    sample = sample.sample(frac=1)
    # val = sample.sample(frac=0.25)
    val = sample.sample(frac=0)
    train = pd.concat([sample, val]).drop_duplicates(keep=False)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    preprocessed_validation_data = pd.concat([preprocessed_validation_data, val], ignore_index=True)
    preprocessed_train_data = pd.concat([preprocessed_train_data, train], ignore_index=True)

# Preprocessed google translation data
train_data_eng = preprocessed_train_data[['text', 'intent_codes']]
train_data_eng.columns = ['text', 'intent_codes']

validation_data_eng = preprocessed_validation_data[['text', 'intent_codes']]
validation_data_eng.columns = ['text', 'intent_codes']

def df_to_dataset(df, shuffle=True, batch_size=16):
    df = df.copy()
    labels = df.pop('intent_codes')
    labels_cat = tf.keras.utils.to_categorical(labels, label_size)
    dataset = tf.data.Dataset.from_tensor_slices((dict(df), labels_cat))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(df))
    dataset = dataset.batch(batch_size).prefetch(batch_size)
    return dataset

_validation = train_data_eng
train_data_eng = df_to_dataset(train_data_eng)

# Validation currently reuses the training frame, since the 25% split is disabled
# validation_data_eng = df_to_dataset(validation_data_eng)
validation_data_eng = df_to_dataset(_validation)

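# Illustrative shape check: each batch is ({'text': (batch,)}, (batch, label_size))
for features, labels_batch in train_data_eng.take(1):
    print(features["text"].shape, labels_batch.shape)
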
"""### Model definition and training

Trains for `training_epochs` (10) epochs
"""

# Model builder
def model_build():
    inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    encoded_input = preprocessor(inputs)
    encoder_outputs = model(encoded_input)

    x = encoder_outputs['pooled_output']
    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.7)(x)
    outputs = tf.keras.layers.Dense(label_size, activation='softmax', name='classifier')(x)

    return tf.keras.Model(inputs, outputs)

# Build a model with preprocessed data
model_eng = model_build()
model_eng.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    # The classifier head ends in softmax, so the loss receives probabilities,
    # not logits
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

eng_history = model_eng.fit(
    train_data_eng,
    epochs=training_epochs,
    batch_size=16,
    validation_data=validation_data_eng,
)

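# Illustrative: inspect the accuracy recorded for the final epoch (the history
# key follows Keras's default naming for the CategoricalAccuracy metric)
print(eng_history.history["categorical_accuracy"][-1])
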
"""## Data extraction pipeline"""

# transformers is already installed and `pipeline` imported at the top
pipe = pipeline("token-classification", model="rkrstacic/bpmn-task-extractor")

"""## Sentence similarity"""

!pip install -U sentence-transformers

import numpy as np
from typing import List, Dict

# Run the NER pipeline and join the tokens tagged as part of a task
def predictNER(text: str) -> Dict:
    currentString = "".join([x["word"] for x in pipe(text) if x["entity"] != "LABEL_0"])

    # SentencePiece marks word starts with "▁"; replace them with spaces and
    # drop the leading space
    return { "Task": currentString.replace("▁", " ")[1:] }

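# Illustrative usage (actual output depends on the fine-tuned extractor), e.g.
# predictNER("Koliko traje predaja dnevnika prakse")
#   -> {"Task": "predaja dnevnika prakse"}
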
from sentence_transformers import SentenceTransformer, util

# NOTE: this rebinds `model` (previously the LaBSE encoder layer); the intent
# classifier was already built above, so the shadowing is harmless here
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

from typing import List
import torch

def getTaskSimilarityIndex(flatIndex: int, tasks) -> int:
    """ Get task index based on the flattened task list """
    for index, task in enumerate(tasks):
        # Each task occupies 1 + len(alias) consecutive slots in the flat list
        if flatIndex <= len(task["alias"]):
            return index

        flatIndex -= len(task["alias"]) + 1

    return -1

def getFlattenTasks(tasks) -> List[str]:
    """ Returns the flattened list of task names and their aliases """
    resTasks = []

    for task in tasks:
        resTasks.append(task["name"])
        resTasks = resTasks + task["alias"]

    return resTasks

def taskSimilarity(text: str, tasks) -> int:
    """ Returns the index of the task most similar to the text """
    return getTaskSimilarityIndex(torch.argmax(util.pytorch_cos_sim(
        model.encode(text, convert_to_tensor=True),
        model.encode(getFlattenTasks(tasks), convert_to_tensor=True)
    )).item(), tasks)

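# Worked example: for the "Praksa" phases the flattened list is
# ["Odabir preferencija", "Prijava prakse", "Odabir zadatka", "Prvi korak",
#  "Ispunjavanje prijavnice",
#  "Predaja dnevnika prakse", "Završetak prakse", "Dnevnik"]
# so a best cosine match at flat index 6 ("Završetak prakse") maps back to
# task index 2 ("Predaja dnevnika prakse")
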
"""## Using the user intent model"""

def preprocessText(text: str) -> str:
    """ Apply the same preprocessing as for the UI model training data """
    text = remove_punctuation(text)
    text = text.lower()
    text = tokenization(text)
    text = lemmatizer(text)
    text = remove_stopwords_hr(text)

    return " ".join(text)

def predict_intent(text: str) -> str:
    """ Predict the text intent with the model trained above """
    return codeToIntent(model_eng.predict([preprocessText(text)], verbose=False))

def getPhases(phases) -> str:
    """ P1: Returns the formatted phase names """
    phases = [phase["name"].lower() for phase in phases]
    return ', '.join(phases[:-1]) + ' i ' + phases[-1]

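# Illustrative: for the "Praksa" process defined above,
# getPhases(json[0]["phases"]) returns
# "odabir preferencija, ispunjavanje prijavnice i predaja dnevnika prakse"
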
# Define functions that handle output text formatting

def getP1String(process) -> str:
    return f"Faze procesa za proces '{process['name']}' su: {getPhases(process['phases'])}"

def getP2String(process) -> str:
    return f"Proces '{process['name']}' traje {process['duration']}"

def getP3String(taskName: str, task) -> str:
    return f"Kratki opis '{taskName}': {task['description']}"

def getP4String(taskName: str, task) -> str:
    return f"Proces '{taskName}' traje {task['duration']}"

def getP5String(taskIndex: int, taskName: str, process) -> str:
    if len(process["phases"]) <= taskIndex + 1:
        return f"'{taskName}' je zadnji korak u procesu '{process['name']}'"

    return f"Nakon '{taskName}' je '{process['phases'][taskIndex + 1]['name'].lower()}'"

def getP6String() -> str:
    return "Nažalost, ne razumijem Vaše pitanje"

def print_result(text: str, process) -> str:
    """ Chatbot output message based on the predicted intent """
    intent = predict_intent(text)
    taskIndex = taskSimilarity(text, process["phases"])
    task = process["phases"][taskIndex]
    taskName = task["name"].lower()

    # P1: Koje su faze (what are the phases?)
    if intent == 'P1':
        return getP1String(process)

    # P2: Koliko traje cijeli proces (how long does the whole process take?)
    elif intent == 'P2':
        return getP2String(process)

    # P3: Kako ide odabir preferencija? (how does {task} work?)
    elif intent == 'P3':
        return getP3String(taskName, task)

    # P4: Koliko traje {task} (how long does {task} take?)
    elif intent == 'P4':
        return getP4String(taskName, task)

    # P5: Što je nakon {task} (what comes after {task}?)
    elif intent == 'P5':
        return getP5String(taskIndex, taskName, process)

    # Ništa od navedenog (none of the above)
    else:
        return getP6String()

def chatbot(input_text) -> str:
    """ By: Rafael Krstačić """
    processName = trainedProcessJSON
    currentProcess = None

    for process in json:
        if process["name"] == processName:
            currentProcess = process
            break
    else:
        # for-else: runs only if no process matched (no break occurred)
        raise KeyError("Process does not exist in json")

    return print_result(input_text, currentProcess)

"""## Gradio app"""

# Smoke test before wiring up the UI
chatbot("Koliko traje predaja dnevnika prakse")

iface = gr.Interface(
    fn=chatbot,
    inputs="text",
    outputs=["text"],
    title="Sentiment Analysis"
)

iface.launch()
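
# Illustrative: when run from Colab, a temporary public link can be created
# with iface.launch(share=True)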