wissamantoun committed on
Commit
3831b68
·
1 Parent(s): 7bdb6b8

formatting

Browse files
Files changed (3) hide show
  1. backend/sa.py +55 -4
  2. backend/services.py +4 -2
  3. backend/utils.py +5 -0
backend/sa.py CHANGED
@@ -13,10 +13,61 @@ predictor = load_text_generator()
13
 
14
 
15
  def write():
16
- input_text = st.text_input("Enter your text here:", key="Fuck you")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  if st.button("Predict"):
18
  with st.spinner("Predicting..."):
19
  prediction, score, all_score = predictor.predict([input_text])
20
- st.write(f"Prediction: {prediction}")
21
- st.write(f"Score: {score}")
22
- st.write(f"All scores: {all_score}")
 
 
 
 
 
 
13
 
14
 
# Markdown shown at the top of the page. Kept flush-left at module level so the
# text carries no accidental leading indentation.
_INTRO_MD = """
# Arabic Sentiment Analysis

This is a simple sentiment analysis app that uses the prediction kernel from Wissam's (me) submission that won the [Arabic Sentiment Analysis competition @ KAUST](https://www.kaggle.com/c/arabic-sentiment-analysis-2021-kaust)
"""

# Markdown shown when the "More info" checkbox is ticked. The bracketed
# numbers refer to the numbered list at the end of the text.
_SUBMISSION_MD = """
### Submission Description:

My submission is based on an ensemble of 5 models with varying preprocessing, and classifier design. All model variants are built over MARBERT [1], which is a BERT-based model pre-trained on 1B dialectal Arabic tweets.

For preprocessing, all models shared the following steps:
- Replacing user mentions with “USER” and links with “URL”
- Replacing the “#” with “HASH”
- Removed the underscore character since it is missing from the MARBERT vocabulary.
- Removed diacritics and elongations (tatweel)
- Spacing out emojis

For classifier design, all models use a dense layer on top of MARBERT unless otherwise specified. Model training is done by hyperparameter grid-search with 5-fold cross-validation with the following search space:
- Learning rate: [2e-5,3e-5,4e-5]
- Batch size: 128
- Maximum sequence length: 64
- Epochs: 3 (we select the best epoch for the final prediction)
- Warmup ratio: [0,0.1]
- Seed: [1,25,42,123,666]

Model I is a vanilla variant with only the preprocessing steps mentioned above applied. Model II enhances the emoji representation by replacing OOV emojis with ones that have similar meaning, for example 💊  😷.
We noticed the repetitive use of “السلام عليكم” and “ورحمة الله وبركاته” in neutral tweets, especially when users were directing questions to business accounts. This could confuse the classifier, if it encountered these words in, for example, a negative tweet, hence in Model III we removed variation of the phrase mentioned before using fuzzy matching algorithms.

In Model IV, we tried to help the model by appending a sarcasm label to the input. We first trained a separate MARBERT on the ArSarcasm [2] dataset, and then used it to label the training and test sets.

Model V uses the vanilla preprocessing approach, but instead of a dense layer built on top of MARBERT, we follow the approach detailed by Safaya et al. [3] which uses a CNN-based classifier instead.

For the final prediction, we first average the predictions of the 5 models from cross-validation (this is done for each model separately), we then average the results from the 5 model variants. We observed that the distribution of the predicted sentiment classes, doesn’t quite match the true distribution, this is due to the model preferring the neutral class over the positive class. To counter that, we apply what we call Label-Weighted average where, after the final averaging, we rescale the score with the following weights 1.57,0.98 and 0.93 for positive, neutral, and negative (note that the weights were determined empirically).

1. https://aclanthology.org/2021.acl-long.551/
2. https://github.com/iabufarha/ArSarcasm
3. https://github.com/alisafaya/OffensEval2020

"""


def write():
    """Render the Arabic sentiment-analysis page.

    Shows the introduction, an optional description of the submission behind
    a "More info" checkbox, a text box, and a "Predict" button. When the
    button is pressed, the entered text is sent to ``predictor.predict`` as a
    single-item list; the first predicted label and the three per-class
    scores of that first item are written to the page.
    """
    st.markdown(_INTRO_MD)
    if st.checkbox("More info: "):
        st.markdown(_SUBMISSION_MD)

    input_text = st.text_input(
        "Enter your text here:",
    )
    if st.button("Predict"):
        with st.spinner("Predicting..."):
            # predict() works on a batch; a one-item batch is sent, so every
            # returned sequence is read at index 0. The second return value
            # (the top score) is not displayed.
            prediction, _, all_score = predictor.predict([input_text])
            st.write(f"Result: {prediction[0]}")
            first_scores = all_score[0]
            detailed_score = {
                "Positive": first_scores[0],
                "Neutral": first_scores[1],
                "Negative": first_scores[2],
            }
            st.write("All scores:")
            st.write(detailed_score)
backend/services.py CHANGED
@@ -11,7 +11,7 @@ from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, set_seed
11
  from .modeling_gpt2 import GPT2LMHeadModel as GROVERLMHeadModel
12
  from .preprocess import ArabertPreprocessor
13
  from .sa_utils import *
14
- from .utils import download_models
15
 
16
  # Taken and Modified from https://huggingface.co/spaces/flax-community/chef-transformer/blob/main/app.py
17
  class TextGeneration:
@@ -344,6 +344,8 @@ class SentimentAnalyzer:
344
  id_label_map[np.argmax([pos_score, neu_score, neg_score])]
345
  )
346
  final_ensemble_score.append(np.max([pos_score, neu_score, neg_score]))
347
- final_ensemble_all_score.append((pos_score, neu_score, neg_score))
 
 
348
 
349
  return final_ensemble_prediction, final_ensemble_score, final_ensemble_all_score
 
11
  from .modeling_gpt2 import GPT2LMHeadModel as GROVERLMHeadModel
12
  from .preprocess import ArabertPreprocessor
13
  from .sa_utils import *
14
+ from .utils import download_models, softmax
15
 
16
  # Taken and Modified from https://huggingface.co/spaces/flax-community/chef-transformer/blob/main/app.py
17
  class TextGeneration:
 
344
  id_label_map[np.argmax([pos_score, neu_score, neg_score])]
345
  )
346
  final_ensemble_score.append(np.max([pos_score, neu_score, neg_score]))
347
+ final_ensemble_all_score.append(
348
+ softmax(np.array([pos_score, neu_score, neg_score])).tolist()
349
+ )
350
 
351
  return final_ensemble_prediction, final_ensemble_score, final_ensemble_all_score
backend/utils.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import psutil
2
  import os
3
  from tqdm.auto import tqdm
@@ -35,3 +36,7 @@ def download_models(models):
35
  os.system(
36
  f"wget -q https://huggingface.co/researchaccount/{model}/resolve/main/train_{i}/best_model/vocab.txt -P {curr_dir}"
37
  )
 
 
 
 
 
1
+ import numpy as np
2
  import psutil
3
  import os
4
  from tqdm.auto import tqdm
 
36
  os.system(
37
  f"wget -q https://huggingface.co/researchaccount/{model}/resolve/main/train_{i}/best_model/vocab.txt -P {curr_dir}"
38
  )
39
+
40
+
41
def softmax(x):
    """Return the softmax of ``x``, normalised along its first axis.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``np.exp`` from overflowing to ``inf``
    (and the ratio from becoming ``nan``) when the scores are large.

    Args:
        x: Array-like of scores. For a 1-D input the returned values sum to 1.

    Returns:
        A NumPy array with the same shape as ``x``.
    """
    scores = np.asarray(x)
    if scores.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(scores)
    # Axis 0 matches what the builtin ``sum`` over an array reduces.
    shifted = scores - np.max(scores, axis=0)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0)