dperales committed on
Commit
b2fbe3d
1 Parent(s): c0b980f

Upload 12 files

Browse files
OCR_Detector.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import streamlit as st
4
+ import easyocr
5
+ import PIL
6
+ from PIL import Image, ImageDraw
7
+
8
class OCRDetector:
    """Thin wrapper that owns a configured EasyOCR reader.

    Attributes:
        reader: The ``easyocr.Reader`` instance used for text recognition.
    """

    def __init__(self, languages=('es', 'en'), gpu=False):
        """Create the EasyOCR reader.

        Parameters:
            languages (sequence of str): Language codes handed to EasyOCR.
                Defaults to Spanish and English, the pair that used to be
                hard-coded here.
            gpu (bool): Forwarded to ``easyocr.Reader``; defaults to ``False``
                exactly as before.
        """
        # Only text written in the listed languages will be recognised.
        self.reader = easyocr.Reader(list(languages), gpu=gpu)
Object_Detector.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tensorflow as tf
3
+ import tensorflow_hub as hub
4
+ # Load compressed models from tensorflow_hub
5
+ os.environ['TFHUB_MODEL_LOAD_FORMAT'] = 'COMPRESSED'
6
+
7
+ import matplotlib.pyplot as plt
8
+ import matplotlib as mpl
9
+
10
+ # For drawing onto the image.
11
+ import numpy as np
12
+ from tensorflow.python.ops.numpy_ops import np_config
13
+ np_config.enable_numpy_behavior()
14
+ from PIL import Image
15
+ from PIL import ImageColor
16
+ from PIL import ImageDraw
17
+ from PIL import ImageFont
18
+ import time
19
+
20
+ import streamlit as st
21
+
22
+ # For measuring the inference time.
23
+ import time
24
+
25
+
26
class ObjectDetector:
    """Object detection backed by a TensorFlow Hub Open Images model.

    Attributes:
        detector: The ``default`` signature of the loaded hub module.
    """

    def __init__(self):
        """Load the SSD MobileNet V2 (Open Images V4) module from TF Hub."""
        # Heavier alternative kept for reference:
        # "https://tfhub.dev/google/faster_rcnn/openimages_v4/inception_resnet_v2/1"
        module_handle = "https://tfhub.dev/google/openimages_v4/ssd/mobilenet_v2/1"
        self.detector = hub.load(module_handle).signatures['default']

    @staticmethod
    def _entity_name(entity):
        """Return a class entity as text, decoding ``bytes`` labels."""
        if isinstance(entity, bytes):
            return entity.decode("utf-8", errors="replace")
        return str(entity)

    @staticmethod
    def _format_top_detection(entity, score):
        """Build the ``"<label> <percent>%"`` summary for one detection."""
        percent = round(float(score) * 100)
        return '{} {}%'.format(ObjectDetector._entity_name(entity), percent)

    @staticmethod
    def _text_size(font, text):
        """Measure ``text`` as ``(width, height)`` across Pillow versions.

        ``ImageFont.getsize`` no longer exists in Pillow 10+, so ``getbbox``
        is preferred when available. The right/bottom edges of the bounding
        box are used because that is what the legacy ``getsize`` reported
        (NOTE(review): confirm against the Pillow deprecation notes).
        """
        if hasattr(font, "getbbox"):
            _, _, right, bottom = font.getbbox(text)
            return right, bottom
        return tuple(font.getsize(text))

    def run_detector(self, path):
        """Run detection on an image and draw the detections onto it.

        Parameters:
            path: Despite the name, the image data itself; it is passed
                straight to ``tf.image.convert_image_dtype`` and to
                ``draw_boxes`` (the app passes a NumPy array).
        Returns:
            tuple: ``(image_with_boxes, primer)`` where ``primer`` is a
            ``"<label> <percent>%"`` string for the first detection returned
            by the model, or an empty string when there are no detections.
        """
        img = path

        # The hub signature expects a float32 batch, hence the new leading axis.
        converted_img = tf.image.convert_image_dtype(img, tf.float32)[tf.newaxis, ...]
        result = self.detector(converted_img)
        result = {key: value.numpy() for key, value in result.items()}

        entities = result["detection_class_entities"]
        scores = result["detection_scores"]

        # Decode the label so the summary reads "Car 57%" rather than "b'Car' 57%".
        primer = self._format_top_detection(entities[0], scores[0]) if len(scores) else ''

        image_with_boxes = self.draw_boxes(
            img, result["detection_boxes"], entities, scores)

        return image_with_boxes, primer

    def display_image(self, image):
        """Show ``image`` in a new matplotlib figure without a grid."""
        plt.figure(figsize=(20, 15))
        plt.grid(False)
        plt.imshow(image)

    def draw_bounding_box_on_image(self, image,
                                   ymin,
                                   xmin,
                                   ymax,
                                   xmax,
                                   color,
                                   font,
                                   thickness=4,
                                   display_str_list=()):
        """Add a bounding box with stacked text labels to a PIL image.

        Parameters:
            image (PIL.Image.Image): Image drawn on in place.
            ymin, xmin, ymax, xmax (float): Box edges as fractions of the
                image height/width (they are multiplied by the image size).
            color: Fill colour for the box outline and label background.
            font: PIL font used to measure and render the labels.
            thickness (int): Outline width in pixels.
            display_str_list (sequence of str): Labels to stack at the box.
        """
        draw = ImageDraw.Draw(image)
        im_width, im_height = image.size
        (left, right, top, bottom) = (xmin * im_width, xmax * im_width,
                                      ymin * im_height, ymax * im_height)
        draw.line([(left, top), (left, bottom), (right, bottom), (right, top),
                   (left, top)],
                  width=thickness,
                  fill=color)

        # If the labels stacked above the box would run off the top of the
        # image, stack them below the top edge of the box instead.
        display_str_heights = [self._text_size(font, ds)[1] for ds in display_str_list]
        # Each label has a top and bottom margin of 0.05x its height.
        total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)

        if top > total_display_str_height:
            text_bottom = top
        else:
            text_bottom = top + total_display_str_height

        # Reverse the list and print from bottom to top.
        for display_str in display_str_list[::-1]:
            text_width, text_height = self._text_size(font, display_str)
            margin = np.ceil(0.05 * text_height)
            draw.rectangle([(left, text_bottom - text_height - 2 * margin),
                            (left + text_width, text_bottom)],
                           fill=color)
            draw.text((left + margin, text_bottom - text_height - margin),
                      display_str,
                      fill="black",
                      font=font)
            # Step up by the full height of the background box just drawn
            # (text plus both margins) so stacked labels do not overlap.
            text_bottom -= text_height + 2 * margin

    def draw_boxes(self, image, boxes, class_names, scores, max_boxes=10, min_score=0.4):
        """Overlay labelled boxes on an image with formatted scores and names.

        Parameters:
            image (numpy.ndarray): Image array; it is updated in place when at
                least one box is drawn.
            boxes: Array of ``(ymin, xmin, ymax, xmax)`` rows.
            class_names: Per-box class names.
            scores: Per-box confidence scores.
            max_boxes (int): Only the first ``max_boxes`` rows are considered.
            min_score (float): Boxes scoring below this value are skipped.
        Returns:
            numpy.ndarray: The same ``image`` object that was passed in.
        """
        colors = list(ImageColor.colormap.values())

        try:
            font = ImageFont.truetype("./Roboto-Light.ttf", 24)
        except IOError:
            print("Font not found, using default font.")
            font = ImageFont.load_default()

        image_pil = None
        for box, class_name, score in zip(boxes[:max_boxes], class_names, scores):
            if score < min_score:
                continue
            if image_pil is None:
                # Convert once; every box is drawn onto the same PIL image.
                image_pil = Image.fromarray(np.uint8(image)).convert("RGB")
            ymin, xmin, ymax, xmax = tuple(box)
            display_str = "{}: {}%".format(self._entity_name(class_name),
                                           int(100 * score))
            color = colors[hash(class_name) % len(colors)]
            self.draw_bounding_box_on_image(
                image_pil,
                ymin,
                xmin,
                ymax,
                xmax,
                color,
                font,
                display_str_list=[display_str])

        if image_pil is not None:
            # Write the annotated pixels back into the caller's array.
            np.copyto(image, np.array(image_pil))
        return image
146
+
Roboto-Light.ttf ADDED
Binary file (170 kB). View file
 
appv2.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+ import easyocr
5
+ import streamlit as st
6
+ from annotated_text import annotated_text
7
+ from streamlit_option_menu import option_menu
8
+ from sentiment_analysis import SentimentAnalysis
9
+ from keyword_extraction import KeywordExtractor
10
+ from part_of_speech_tagging import POSTagging
11
+ from emotion_detection import EmotionDetection
12
+ from named_entity_recognition import NamedEntityRecognition
13
+ from Object_Detector import ObjectDetector
14
+ from OCR_Detector import OCRDetector
15
+ import PIL
16
+ from PIL import Image
17
+ from PIL import ImageColor
18
+ from PIL import ImageDraw
19
+ from PIL import ImageFont
20
+ import time
21
+
22
+ # Imports de Object Detection
23
+ import tensorflow as tf
24
+ import tensorflow_hub as hub
25
+ # Load compressed models from tensorflow_hub
26
+ os.environ['TFHUB_MODEL_LOAD_FORMAT'] = 'COMPRESSED'
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib as mpl
29
+ # For drawing onto the image.
30
+ import numpy as np
31
+ from tensorflow.python.ops.numpy_ops import np_config
32
+ np_config.enable_numpy_behavior()
33
+
34
+ import torch
35
+ import librosa
36
+ from models import infere_speech_emotion, infere_text_emotion, infere_voice2text
37
+
38
# Use the full browser width for the page layout.
st.set_page_config(layout="wide")

# CSS injected into the page to hide the "#MainMenu" element and the footer.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
47
+
48
@st.cache_resource
def load_sentiment_model():
    """Construct the ``SentimentAnalysis`` helper (wrapped by ``st.cache_resource``)."""
    analyzer = SentimentAnalysis()
    return analyzer
51
+
52
@st.cache_resource
def load_keyword_model():
    """Construct the ``KeywordExtractor`` helper (wrapped by ``st.cache_resource``)."""
    extractor = KeywordExtractor()
    return extractor
55
+
56
@st.cache_resource
def load_pos_model():
    """Construct the ``POSTagging`` helper (wrapped by ``st.cache_resource``)."""
    tagger = POSTagging()
    return tagger
59
+
60
@st.cache_resource
def load_emotion_model():
    """Construct the ``EmotionDetection`` helper (wrapped by ``st.cache_resource``)."""
    detector = EmotionDetection()
    return detector
63
+
64
@st.cache_resource
def load_ner_model():
    """Construct the ``NamedEntityRecognition`` helper (wrapped by ``st.cache_resource``)."""
    recognizer = NamedEntityRecognition()
    return recognizer
67
+
68
@st.cache_resource
def load_objectdetector_model():
    """Construct the ``ObjectDetector`` helper (wrapped by ``st.cache_resource``)."""
    detector = ObjectDetector()
    return detector
71
+
72
@st.cache_resource
def load_ocrdetector_model():
    """Construct the ``OCRDetector`` helper (wrapped by ``st.cache_resource``)."""
    detector = OCRDetector()
    return detector
75
+
76
# Text models, obtained through the cached loader functions.
sentiment_analyzer = load_sentiment_model()
keyword_extractor = load_keyword_model()
pos_tagger = load_pos_model()
emotion_detector = load_emotion_model()
ner = load_ner_model()

# Image models, obtained through the cached loader functions.
objectdetector1 = load_objectdetector_model()
ocrdetector1 = load_ocrdetector_model()
83
+
84
def rectangle(image, result):
    """Outline every OCR detection on ``image`` and render it in the app.

    Parameters:
        image (PIL.Image.Image): Image to draw on; it is modified in place.
        result (list): OCR results; the first element of each item is the
            list of corner points of the detected text box.
    """
    draw = ImageDraw.Draw(image)
    for res in result:
        corners = res[0]
        xs = [point[0] for point in corners]
        ys = [point[1] for point in corners]
        # Span all corners instead of only corners 0 and 2: for a rotated box
        # those two points can be "inverted", which Pillow may reject.
        # For axis-aligned boxes this yields the same rectangle as before.
        top_left = (min(xs), min(ys))
        bottom_right = (max(xs), max(ys))
        draw.rectangle((top_left, bottom_right), outline="blue", width=2)
    st.image(image)
91
+
92
# Default text pre-filled in every text-analysis page.
example_text = "My name is Daniel: The attention to detail, swift resolution, and accuracy demonstrated by ITACA Insurance Company in Spain in handling my claim were truly impressive. This undoubtedly reflects their commitment to being a customer-centric insurance provider."

# Sidebar: logo plus the menu that selects which page is rendered below.
with st.sidebar:
    image = Image.open('./itaca_logo.png')
    st.image(image, width=150)
    page = option_menu(menu_title='Menu',
                       menu_icon="robot",
                       options=["Sentiment Analysis",
                                "Keyword Extraction",
                                "Part of Speech Tagging",
                                "Emotion Detection",
                                "Named Entity Recognition",
                                "Speech & Text Emotion",
                                "Object Detector",
                                "OCR Detector"],
                       icons=["chat-dots",
                              "key",
                              "tag",
                              "emoji-heart-eyes",
                              "building",
                              "book",
                              "camera",
                              "list-task"],
                       default_index=0
                       )

st.title('ITACA Insurance Core AI Module')

# Replace '20px' with your desired font size
font_size = '20px'

# Only the branch matching the selected menu entry is rendered on each run.
if page == "Sentiment Analysis":
    st.header('Sentiment Analysis')
    st.write(
        """
        """
    )

    text = st.text_area("Paste text here", value=example_text)

    if st.button('🔥 Run!'):
        with st.spinner("Loading..."):
            preds, html = sentiment_analyzer.run(text)
            st.success('All done!')
            st.write("")
            st.subheader("Sentiment Predictions")
            st.bar_chart(data=preds, width=0, height=0, use_container_width=True)
            st.write("")
            st.subheader("Sentiment Justification")
            raw_html = html._repr_html_()
            st.components.v1.html(raw_html, height=500)

elif page == "Keyword Extraction":
    st.header('Keyword Extraction')
    st.write(
        """
        """
    )

    text = st.text_area("Paste text here", value=example_text)

    max_keywords = st.slider('# of Keywords Max Limit', min_value=1, max_value=10, value=5, step=1)

    if st.button('🔥 Run!'):
        with st.spinner("Loading..."):
            annotation, keywords = keyword_extractor.generate(text, max_keywords)
            st.success('All done!')

        # ``annotation`` is None when no keyword could be located in the text.
        if annotation:
            st.subheader("Keyword Annotation")
            st.write("")
            annotated_text(*annotation)
            st.text("")

        st.subheader("Extracted Keywords")
        st.write("")
        df = pd.DataFrame(keywords, columns=['Extracted Keywords'])
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button('Download Keywords to CSV', csv, file_name='news_intelligence_keywords.csv')

        data_table = st.table(df)

elif page == "Part of Speech Tagging":
    st.header('Part of Speech Tagging')
    st.write(
        """
        """
    )

    text = st.text_area("Paste text here", value=example_text)

    if st.button('🔥 Run!'):
        with st.spinner("Loading..."):
            preds = pos_tagger.classify(text)
            st.success('All done!')
            st.write("")
            st.subheader("Part of Speech tags")
            annotated_text(*preds)
            st.write("")
            # Embedded reference page for the tag names.
            st.components.v1.iframe('https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html', height=1000)

elif page == "Emotion Detection":
    st.header('Emotion Detection')
    st.write(
        """
        """
    )

    text = st.text_area("Paste text here", value=example_text)

    if st.button('🔥 Run!'):
        with st.spinner("Loading..."):
            preds, html = emotion_detector.run(text)
            st.success('All done!')
            st.write("")
            st.subheader("Emotion Predictions")
            st.bar_chart(data=preds, width=0, height=0, use_container_width=True)
            raw_html = html._repr_html_()
            st.write("")
            st.subheader("Emotion Justification")
            st.components.v1.html(raw_html, height=500)

elif page == "Named Entity Recognition":
    st.header('Named Entity Recognition')
    st.write(
        """
        """
    )

    text = st.text_area("Paste text here", value=example_text)

    if st.button('🔥 Run!'):
        with st.spinner("Loading..."):
            preds, ner_annotation = ner.classify(text)
            st.success('All done!')
            st.write("")
            st.subheader("NER Predictions")
            annotated_text(*ner_annotation)
            st.write("")
            st.subheader("NER Prediction Metadata")
            st.write(preds)

elif page == "Object Detector":
    st.header('Object Detector')
    st.write(
        """
        """
    )

    img_file_buffer = st.file_uploader("Load an image", type=["png", "jpg", "jpeg"])

    if st.button('🔥 Run!'):
        if img_file_buffer is None:
            # Without this guard the detector would run on whatever ``image``
            # currently holds (the sidebar logo) instead of a user upload.
            st.warning("Please upload an image first.")
        else:
            # Force three channels: palette, greyscale or RGBA uploads would
            # otherwise produce arrays the drawing code cannot write back to.
            uploaded_image = np.array(Image.open(img_file_buffer).convert("RGB"))
            with st.spinner("Loading..."):
                img, primero = objectdetector1.run_detector(uploaded_image)
                st.success('The first image detected is: ' + primero)
                st.image(img, caption="Imagen", use_column_width=True)

elif page == "OCR Detector":
    st.header('OCR Detector')
    st.write(
        """
        """
    )

    file = st.file_uploader("Load an image", type=["png", "jpg", "jpeg"])

    if st.button('🔥 Run!'):
        if file is None:
            st.warning("Please upload an image first.")
        else:
            # Read the uploaded image with PIL.
            ocr_image = Image.open(file)
            with st.spinner("Loading..."):
                # EasyOCR is given the image as a NumPy array.
                result = ocrdetector1.reader.readtext(np.array(ocr_image))

                # Collect the results keyed by recognised text; each item is
                # (box corners, text, confidence).
                textdic_easyocr = {}
                for _pred_coor, pred_text, pred_confidence in result:
                    textdic_easyocr[pred_text] = {'pred_confidence': pred_confidence}

                # Draw the detected boxes on the image and show it.
                rectangle(ocr_image, result)

                # Table of predicted text and prediction confidence.
                df = pd.DataFrame.from_dict(textdic_easyocr).T
                st.table(df)

elif page == "Speech & Text Emotion":
    st.header('Speech & Text Emotion')
    st.write(
        """
        """
    )
    uploaded_file = st.file_uploader("Choose an audio file", type=["mp3", "wav", "ogg"])

    if uploaded_file is not None:
        st.audio(uploaded_file, format='audio/' + uploaded_file.type.split('/')[1])
        st.write("Audio file uploaded and playing.")
    else:
        st.write("Please upload an audio file.")

    if st.button("Analysis"):
        if uploaded_file is None:
            # Nothing to analyse yet; loading ``None`` would raise.
            st.warning("Please upload an audio file.")
        else:
            with st.spinner("Loading..."):
                st.header('Results of the Audio & Text analysis:')
                # Rewind in case the player above consumed the buffer
                # (NOTE(review): harmless if it did not).
                uploaded_file.seek(0)
                samples, sample_rate = librosa.load(uploaded_file, sr=16000)
                p_voice2text = infere_voice2text(samples)
                p_speechemotion = infere_speech_emotion(samples)
                p_textemotion = infere_text_emotion(p_voice2text)
                st.subheader("Text from the Audio:")
                st.write(p_voice2text)
                st.write("---")
                st.subheader("Speech emotion:")
                st.write(p_speechemotion)
                st.write("---")
                st.subheader("Text emotion:")
                st.write(p_textemotion)
                st.write("---")
319
+
320
+
emotion_detection.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
2
+ from transformers_interpret import SequenceClassificationExplainer
3
+ import torch
4
+ import pandas as pd
5
+
6
+
7
class EmotionDetection:
    """
    Emotion Detection on text data.
    Attributes:
        tokenizer: An instance of Hugging Face Tokenizer
        model: An instance of Hugging Face Model
        explainer: An instance of SequenceClassificationExplainer from Transformers interpret
    """

    def __init__(self):
        hub_location = 'cardiffnlp/twitter-roberta-base-emotion'
        self.tokenizer = AutoTokenizer.from_pretrained(hub_location)
        self.model = AutoModelForSequenceClassification.from_pretrained(hub_location)
        self.explainer = SequenceClassificationExplainer(self.model, self.tokenizer)

    def justify(self, text):
        """
        Get html annotation for displaying emotion justification over text.
        Parameters:
            text (str): The user input string to justify the emotion prediction for
        Returns:
            html (html): html object for plotting emotion prediction justification
        """

        # The explainer must be called first; ``visualize`` renders the
        # attributions computed by that call. The returned list is not needed.
        self.explainer(text)
        # NOTE(review): passing a path presumably also writes "example.html"
        # to the working directory - confirm against transformers-interpret.
        html = self.explainer.visualize("example.html")

        return html

    def classify(self, text):
        """
        Recognize Emotion in text.
        Parameters:
            text (str): The user input string to perform emotion classification on
        Returns:
            predictions (pandas.Series): The predicted probabilities for emotion classes
        """

        # Tokenize with the tokenizer's default special tokens, as the
        # explainer and the published model usage do; without them the
        # probabilities can disagree with the justification view and an empty
        # string yields an empty input the model cannot process.
        # NOTE(review): the "- 2" assumes RoBERTa-style position embeddings
        # that reserve two slots - confirm for other checkpoints.
        max_length = getattr(self.model.config, "max_position_embeddings", 514) - 2
        tokens = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=max_length)

        # Inference only: no gradients are needed for the forward pass.
        with torch.no_grad():
            outputs = self.model(**tokens)

        probs = torch.nn.functional.softmax(outputs[0], dim=-1)
        probs = probs.mean(dim=0).detach().numpy()
        # Look labels up by class id rather than relying on dict ordering.
        id2label = self.model.config.id2label
        labels = [id2label[i] for i in range(len(probs))]
        preds = pd.Series(probs, index=labels, name='Predicted Probability')

        return preds

    def run(self, text):
        """
        Classify and Justify Emotion in text.
        Parameters:
            text (str): The user input string to perform emotion classification on
        Returns:
            predictions (pandas.Series): The predicted probabilities for emotion classes
            html (html): html object for plotting emotion prediction justification
        """

        preds = self.classify(text)
        html = self.justify(text)

        return preds, html
itaca_logo.png ADDED
keyword_extraction.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ import pytextrank
3
+ import re
4
+ from operator import itemgetter
5
+ import en_core_web_sm
6
+
7
+
8
class KeywordExtractor:
    """
    Keyword Extraction on text data
    Attributes:
        nlp: The spaCy pipeline returned by ``en_core_web_sm.load()`` with the
            "textrank" component added
    """

    def __init__(self):
        self.nlp = en_core_web_sm.load()
        self.nlp.add_pipe("textrank")

    def get_keywords(self, text, max_keywords):
        """
        Extract keywords from text.
        Parameters:
            text (str): The user input string to extract keywords from
            max_keywords (int): Limit on number of keywords to return
        Returns:
            kws (list): list of extracted keywords
        """

        doc = self.nlp(text)
        return [phrase.text for phrase in doc._.phrases[:max_keywords]]

    def get_keyword_indices(self, kws, text):
        """
        Locate every occurrence of each keyword in text.
        Parameters:
            kws (list): list of extracted keywords
            text (str): The user input string to search
        Returns:
            keyword_indices (list): list of [start, end] keyword boundaries in text
        """

        keyword_indices = []
        for keyword in kws:
            if not keyword:
                # An empty pattern would match at every position of the text.
                continue
            pattern = re.escape(keyword)
            keyword_indices.extend([m.start(), m.end()] for m in re.finditer(pattern, text))

        return keyword_indices

    def merge_overlapping_indices(self, keyword_indices):
        """
        Merge overlapping keyword indices.
        Spans are merged when they overlap, touch, or are separated by a
        single character.
        Parameters:
            keyword_indices (list): list of [start, end] keyword boundaries in text
        Returns:
            keyword_indices (list): new list of [start, end] boundaries with
                overlapping spans combined; the input is left untouched and an
                empty input gives an empty list
        """

        merged = []
        for start, end in sorted(keyword_indices):
            # Spans are visited in start order, so only the last merged span
            # can overlap or be adjacent to the current one.
            if merged and start <= merged[-1][1] + 1:
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end])
        return merged

    def merge_until_finished(self, keyword_indices):
        """
        Merge keyword indices until no overlapping spans are left.
        Parameters:
            keyword_indices (list): list of [start, end] keyword boundaries in text
        Returns:
            keyword_indices (list): sorted list of boundaries with overlapping spans combined
        """

        # One pass over start-sorted spans leaves nothing further to merge,
        # so no repeat-until-stable loop is required.
        return self.merge_overlapping_indices(keyword_indices)

    def get_annotation(self, text, keyword_indices):
        """
        Create text annotation for extracted keywords.
        Parameters:
            text (str): The user input string the indices refer to
            keyword_indices (list): list of non-overlapping [start, end] keyword boundaries
        Returns:
            annotation (list): plain strings interleaved with
                (keyword, "KEY", colour) tuples, in text order
        """

        # Slice the text directly instead of inserting "<kw>" markers and
        # splitting on them, which broke when the text itself contained them.
        annotation = []
        cursor = 0
        for start, end in sorted(keyword_indices):
            annotation.append(text[cursor:start])
            annotation.append((text[start:end], "KEY", "#26aaef"))
            cursor = end
        annotation.append(text[cursor:])

        return annotation

    def generate(self, text, max_keywords):
        """
        Extract keywords and create the text annotation for them.
        Parameters:
            text (str): The user input string to extract keywords from
            max_keywords (int): Limit on number of keywords to generate
        Returns:
            annotation (list): list of strings/tuples for generating html, or
                None when no keyword occurrence was found in the text
            kws (list): list of extracted keywords
        """

        kws = self.get_keywords(text, max_keywords)

        indices = self.get_keyword_indices(kws, text)
        if indices:
            indices_merged = self.merge_until_finished(indices)
            annotation = self.get_annotation(text, indices_merged)
        else:
            annotation = None

        return annotation, kws
145
+
models.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import the necessary libraries
2
+ from transformers import pipeline
3
+
4
+ # Initialize the text classification model with a pre-trained model
5
# Text emotion classifier (pre-trained checkpoint).
model_text_emotion = pipeline(
    task="text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
)

# Speech emotion recognition (SER) classifier (pre-trained checkpoint).
model_speech_emotion = pipeline(
    task="audio-classification",
    model="aherzberg/ser_model_fixed_label",
)

# Automatic speech recognition model used to turn speech into text.
model_voice2text = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-tiny.en",
)
12
+
13
+ # A function that uses the initialized text classification model to predict the emotion of a given text input
14
def infere_text_emotion(text):
    """Return the capitalised label of the first result from the text emotion model."""
    first_result = model_text_emotion(text)[0]
    label = first_result["label"]
    return label.capitalize()
16
+
17
+ # A function that uses the initialized audio classification model to predict the emotion of a given speech input
18
def infere_speech_emotion(text):
    """Predict the emotion of a speech input with the audio classification model.

    Parameters:
        text: Despite the name, the audio input handed to the speech model
            (the app passes the loaded audio samples).
    Returns:
        str: The emotion name using the same vocabulary as the text emotion
        model where a mapping exists; otherwise the model's own label,
        capitalised.
    """
    # Maps the speech model's labels onto the text model's label names.
    emotions_dict = {"angry": "Anger", "disgust": "Disgust", "fear": "Fear", "happy": "Joy", "neutral": "Neutral", "sad": "Sadness"}
    inference = model_speech_emotion(text)[0]["label"]
    # Fall back to the raw label instead of raising KeyError when the model
    # emits a label that has no entry in the mapping.
    return emotions_dict.get(inference, str(inference).capitalize())
23
+
24
+ # A function that uses the initialized automatic speech recognition model to convert speech (as an audio file) to text
25
def infere_voice2text(audio_file):
    """Return the ``"text"`` field produced by the speech recognition model."""
    transcription = model_voice2text(audio_file)
    return transcription["text"]
named_entity_recognition.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
2
+ from transformers import pipeline
3
+
4
+
5
class NamedEntityRecognition:
    """
    Named Entity Recognition on text data.
    Attributes:
        nlp: An instance of Hugging Face Named Entity Recognition pipeline
    """

    def __init__(self):
        tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
        model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
        self.nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)

    def get_annotation(self, preds, text):
        """
        Get html annotation for displaying entities over text.
        Parameters:
            preds (list): Entity dicts with 'start', 'end', 'word' and 'entity_group' keys
            text (str): The user input string the predictions refer to
        Returns:
            final_annotation (list): Plain strings interleaved with
                (text, entity_group, "") tuples to pass to the text annotation html creator
        """

        # Predictions that are only whitespace/punctuation are shown as plain text.
        exclude = {'', '.', '. ', ' '}

        final_annotation = []
        cursor = 0
        for pred in sorted(preds, key=lambda p: p['start']):
            start, end = pred['start'], pred['end']
            if start < cursor:
                # Overlaps a span that was already emitted; skip it.
                continue
            if start > cursor:
                final_annotation.append(text[cursor:start])
            span = text[start:end]
            # Label each span from its own prediction. Looking labels up by
            # text gave repeated words the label of their last occurrence and
            # dropped entities whose 'word' differed from the text slice.
            if span in exclude or pred['word'] in exclude:
                if span:
                    final_annotation.append(span)
            else:
                final_annotation.append((span, pred['entity_group'], ""))
            cursor = end

        tail = text[cursor:]
        if tail or not final_annotation:
            final_annotation.append(tail)

        return final_annotation

    def classify(self, text):
        """
        Recognize Named Entities in text.
        Parameters:
            text (str): The user input string to generate entity tags for
        Returns:
            predictions (list): The raw output of the NER pipeline for the text
            ner_annotation (list): Strings and tuples built by ``get_annotation``
        """

        preds = self.nlp(text)
        ner_annotation = self.get_annotation(preds, text)
        return preds, ner_annotation
part_of_speech_tagging.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ from nltk.tokenize import word_tokenize
3
# Fetch the NLTK resources used by the tokenizer and the tagger below.
for _resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)
5
+
6
+
7
class POSTagging:
    """Part of Speech Tagging on text data"""

    def __init__(self):
        # No state to set up; tagging relies on module-level NLTK functions.
        pass

    def classify(self, text):
        """
        Generate Part of Speech tags.
        Parameters:
            text (str): The user input string to generate tags for
        Returns:
            predictions (list): list of tuples containing words and their respective tags
        """

        tokens = word_tokenize(text)
        return nltk.pos_tag(tokens)
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Pillow
2
+ streamlit==1.21.0
3
+ pandas
4
+ numpy
5
+ matplotlib
6
+ tensorflow
7
+ tensorflow-hub
8
+ scikit-learn
9
+ easyocr
10
+ nltk~=3.5
11
+ typing-extensions
12
+ streamlit-option-menu~=0.3.2
13
+ st-annotated-text~=3.0.0
14
+ transformers-interpret~=0.7.2
15
+ htbuilder==0.6.0
16
+ pytextrank~=3.2.3
17
+ spacy~=3.0.5
18
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl
sentiment_analysis.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
2
+ from transformers_interpret import SequenceClassificationExplainer
3
+ import torch
4
+ import pandas as pd
5
+
6
+
7
class SentimentAnalysis:
    """
    Sentiment on text data.
    Attributes:
        tokenizer: An instance of Hugging Face Tokenizer
        model: An instance of Hugging Face Model
        explainer: An instance of SequenceClassificationExplainer from Transformers interpret
    """

    def __init__(self):
        # Load Tokenizer & Model
        hub_location = 'cardiffnlp/twitter-roberta-base-sentiment'
        self.tokenizer = AutoTokenizer.from_pretrained(hub_location)
        self.model = AutoModelForSequenceClassification.from_pretrained(hub_location)

        # Replace the generic LABEL_n names in the model config with readable ones.
        self.model.config.id2label[0] = "Negative"
        self.model.config.id2label[1] = "Neutral"
        self.model.config.id2label[2] = "Positive"
        self.model.config.label2id["Negative"] = self.model.config.label2id.pop("LABEL_0")
        self.model.config.label2id["Neutral"] = self.model.config.label2id.pop("LABEL_1")
        self.model.config.label2id["Positive"] = self.model.config.label2id.pop("LABEL_2")

        # Instantiate explainer
        self.explainer = SequenceClassificationExplainer(self.model, self.tokenizer)

    def justify(self, text):
        """
        Get html annotation for displaying sentiment justification over text.
        Parameters:
            text (str): The user input string to justify the sentiment prediction for
        Returns:
            html (html): html object for plotting sentiment prediction justification
        """

        # The explainer must be called first; ``visualize`` renders the
        # attributions computed by that call. The returned list is not needed.
        self.explainer(text)
        # NOTE(review): passing a path presumably also writes "example.html"
        # to the working directory - confirm against transformers-interpret.
        html = self.explainer.visualize("example.html")

        return html

    def classify(self, text):
        """
        Recognize Sentiment in text.
        Parameters:
            text (str): The user input string to perform sentiment classification on
        Returns:
            predictions (pandas.Series): The predicted probabilities for sentiment classes
        """

        # Tokenize with the tokenizer's default special tokens, as the
        # explainer and the published model usage do; without them the
        # probabilities can disagree with the justification view and an empty
        # string yields an empty input the model cannot process.
        # NOTE(review): the "- 2" assumes RoBERTa-style position embeddings
        # that reserve two slots - confirm for other checkpoints.
        max_length = getattr(self.model.config, "max_position_embeddings", 514) - 2
        tokens = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=max_length)

        # Inference only: no gradients are needed for the forward pass.
        with torch.no_grad():
            outputs = self.model(**tokens)

        probs = torch.nn.functional.softmax(outputs[0], dim=-1)
        probs = probs.mean(dim=0).detach().numpy()
        predictions = pd.Series(probs, index=["Negative", "Neutral", "Positive"], name='Predicted Probability')

        return predictions

    def run(self, text):
        """
        Classify and Justify Sentiment in text.
        Parameters:
            text (str): The user input string to perform sentiment classification on
        Returns:
            predictions (pandas.Series): The predicted probabilities for sentiment classes
            html (html): html object for plotting sentiment prediction justification
        """

        predictions = self.classify(text)
        html = self.justify(text)

        return predictions, html