Spaces:

aubmindlab
/

Arabic-NLP

Running

App Files Files Community

wissamantoun committed on Sep 11, 2021

Commit

0558cbb

1 Parent(s): 854b7af

added Sentiment Analysis

Browse files

Files changed (6) hide show

app.py +2 -0
backend/sa.py +19 -0
backend/sa_utils.py +510 -0
backend/services.py +177 -0
backend/utils.py +10 -0
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import streamlit as st
 import backend.aragpt
 import backend.home
 import backend.processor
 from backend.utils import get_current_ram_usage
 st.set_page_config(
@@ -14,6 +15,7 @@ PAGES = {
     "Home": backend.home,
     "Arabic Text Preprocessor": backend.processor,
     "Arabic Language Generation": backend.aragpt,
 }

 import backend.aragpt
 import backend.home
 import backend.processor
+import backend.sa
 from backend.utils import get_current_ram_usage
 st.set_page_config(
     "Home": backend.home,
     "Arabic Text Preprocessor": backend.processor,
     "Arabic Language Generation": backend.aragpt,
+    "Arabic Sentiment Analysis": backend.sa,
 }

backend/sa.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import streamlit as st
+from .services import SentimentAnalyzer
+from functools import lru_cache
+# @st.cache(allow_output_mutation=False, hash_funcs={Tokenizer: str})
+@lru_cache(maxsize=1)
+def load_text_generator():
+    predictor = SentimentAnalyzer()
+    return predictor
+predictor = load_text_generator()
+def write():
+    input_text = st.text_input("Enter your text here:", key="Fuck you")
+    if st.button("Predict"):
+        with st.spinner("Predicting..."):
+            prediction, score, all_score = predictor.predict([input_text])

backend/sa_utils.py ADDED Viewed

	@@ -0,0 +1,510 @@

+import re
+from contextlib import contextmanager
+import numpy as np
+import torch
+import torch.nn.functional as F
+from fuzzysearch import find_near_matches
+from pyarabic import araby
+from torch import nn
+from transformers import AutoTokenizer, BertModel, BertPreTrainedModel, pipeline
+from transformers.modeling_outputs import SequenceClassifierOutput
+from .preprocess import ArabertPreprocessor, url_regexes, user_mention_regex
+multiple_char_pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
+# ASAD-NEW_AraBERT_PREP-Balanced
+class NewArabicPreprocessorBalanced(ArabertPreprocessor):
+    def __init__(
+        self,
+        model_name: str,
+        keep_emojis: bool = False,
+        remove_html_markup: bool = True,
+        replace_urls_emails_mentions: bool = True,
+        strip_tashkeel: bool = True,
+        strip_tatweel: bool = True,
+        insert_white_spaces: bool = True,
+        remove_non_digit_repetition: bool = True,
+        replace_slash_with_dash: bool = None,
+        map_hindi_numbers_to_arabic: bool = None,
+        apply_farasa_segmentation: bool = None,
+    ):
+        if "UBC-NLP" in model_name or "CAMeL-Lab" in model_name:
+            keep_emojis = True
+            remove_non_digit_repetition = True
+        super().__init__(
+            model_name=model_name,
+            keep_emojis=keep_emojis,
+            remove_html_markup=remove_html_markup,
+            replace_urls_emails_mentions=replace_urls_emails_mentions,
+            strip_tashkeel=strip_tashkeel,
+            strip_tatweel=strip_tatweel,
+            insert_white_spaces=insert_white_spaces,
+            remove_non_digit_repetition=remove_non_digit_repetition,
+            replace_slash_with_dash=replace_slash_with_dash,
+            map_hindi_numbers_to_arabic=map_hindi_numbers_to_arabic,
+            apply_farasa_segmentation=apply_farasa_segmentation,
+        )
+        self.true_model_name = model_name
+    def preprocess(self, text):
+        if "UBC-NLP" in self.true_model_name:
+            return self.ubc_prep(text)
+    def ubc_prep(self, text):
+        text = re.sub("\s", " ", text)
+        text = text.replace("\\n", " ")
+        text = text.replace("\\r", " ")
+        text = araby.strip_tashkeel(text)
+        text = araby.strip_tatweel(text)
+        # replace all possible URLs
+        for reg in url_regexes:
+            text = re.sub(reg, " URL ", text)
+        text = re.sub("(URL\s*)+", " URL ", text)
+        # replace mentions with USER
+        text = re.sub(user_mention_regex, " USER ", text)
+        text = re.sub("(USER\s*)+", " USER ", text)
+        # replace hashtags with HASHTAG
+        # text = re.sub(r"#[\w\d]+", " HASH TAG ", text)
+        text = text.replace("#", " HASH ")
+        text = text.replace("_", " ")
+        text = " ".join(text.split())
+        # text = re.sub("\B\\[Uu]\w+", "", text)
+        text = text.replace("\\U0001f97a", "🥺")
+        text = text.replace("\\U0001f928", "🤨")
+        text = text.replace("\\U0001f9d8", "😀")
+        text = text.replace("\\U0001f975", "😥")
+        text = text.replace("\\U0001f92f", "😲")
+        text = text.replace("\\U0001f92d", "🤭")
+        text = text.replace("\\U0001f9d1", "😐")
+        text = text.replace("\\U000e0067", "")
+        text = text.replace("\\U000e006e", "")
+        text = text.replace("\\U0001f90d", "♥")
+        text = text.replace("\\U0001f973", "🎉")
+        text = text.replace("\\U0001fa79", "")
+        text = text.replace("\\U0001f92b", "🤐")
+        text = text.replace("\\U0001f9da", "🦋")
+        text = text.replace("\\U0001f90e", "♥")
+        text = text.replace("\\U0001f9d0", "🧐")
+        text = text.replace("\\U0001f9cf", "")
+        text = text.replace("\\U0001f92c", "😠")
+        text = text.replace("\\U0001f9f8", "😸")
+        text = text.replace("\\U0001f9b6", "💩")
+        text = text.replace("\\U0001f932", "🤲")
+        text = text.replace("\\U0001f9e1", "🧡")
+        text = text.replace("\\U0001f974", "☹")
+        text = text.replace("\\U0001f91f", "")
+        text = text.replace("\\U0001f9fb", "💩")
+        text = text.replace("\\U0001f92a", "🤪")
+        text = text.replace("\\U0001f9fc", "")
+        text = text.replace("\\U000e0065", "")
+        text = text.replace("\\U0001f92e", "💩")
+        text = text.replace("\\U000e007f", "")
+        text = text.replace("\\U0001f970", "🥰")
+        text = text.replace("\\U0001f929", "🤩")
+        text = text.replace("\\U0001f6f9", "")
+        text = text.replace("🤍", "♥")
+        text = text.replace("🦠", "😷")
+        text = text.replace("🤢", "مقرف")
+        text = text.replace("🤮", "مقرف")
+        text = text.replace("🕠", "⌚")
+        text = text.replace("🤬", "😠")
+        text = text.replace("🤧", "😷")
+        text = text.replace("🥳", "🎉")
+        text = text.replace("🥵", "🔥")
+        text = text.replace("🥴", "☹")
+        text = text.replace("🤫", "🤐")
+        text = text.replace("🤥", "كذاب")
+        text = text.replace("\\u200d", " ")
+        text = text.replace("u200d", " ")
+        text = text.replace("\\u200c", " ")
+        text = text.replace("u200c", " ")
+        text = text.replace('"', "'")
+        text = text.replace("\\xa0", "")
+        text = text.replace("\\u2066", " ")
+        text = re.sub("\B\\\[Uu]\w+", "", text)
+        text = super(NewArabicPreprocessorBalanced, self).preprocess(text)
+        text = " ".join(text.split())
+        return text
+"""CNNMarbertArabicPreprocessor"""
+# ASAD-CNN_MARBERT
+class CNNMarbertArabicPreprocessor(ArabertPreprocessor):
+    def __init__(
+        self,
+        model_name,
+        keep_emojis=False,
+        remove_html_markup=True,
+        replace_urls_emails_mentions=True,
+        remove_elongations=True,
+    ):
+        if "UBC-NLP" in model_name or "CAMeL-Lab" in model_name:
+            keep_emojis = True
+            remove_elongations = False
+        super().__init__(
+            model_name,
+            keep_emojis,
+            remove_html_markup,
+            replace_urls_emails_mentions,
+            remove_elongations,
+        )
+        self.true_model_name = model_name
+    def preprocess(self, text):
+        if "UBC-NLP" in self.true_model_name:
+            return self.ubc_prep(text)
+    def ubc_prep(self, text):
+        text = re.sub("\s", " ", text)
+        text = text.replace("\\n", " ")
+        text = araby.strip_tashkeel(text)
+        text = araby.strip_tatweel(text)
+        # replace all possible URLs
+        for reg in url_regexes:
+            text = re.sub(reg, " URL ", text)
+        text = re.sub("(URL\s*)+", " URL ", text)
+        # replace mentions with USER
+        text = re.sub(user_mention_regex, " USER ", text)
+        text = re.sub("(USER\s*)+", " USER ", text)
+        # replace hashtags with HASHTAG
+        # text = re.sub(r"#[\w\d]+", " HASH TAG ", text)
+        text = text.replace("#", " HASH ")
+        text = text.replace("_", " ")
+        text = " ".join(text.split())
+        text = super(CNNMarbertArabicPreprocessor, self).preprocess(text)
+        text = text.replace("\u200d", " ")
+        text = text.replace("u200d", " ")
+        text = text.replace("\u200c", " ")
+        text = text.replace("u200c", " ")
+        text = text.replace('"', "'")
+        # text = re.sub('[\d\.]+', ' NUM ', text)
+        # text = re.sub('(NUM\s*)+', ' NUM ', text)
+        text = multiple_char_pattern.sub(r"\1\1", text)
+        text = " ".join(text.split())
+        return text
+"""Trial5ArabicPreprocessor"""
+class Trial5ArabicPreprocessor(ArabertPreprocessor):
+    def __init__(
+        self,
+        model_name,
+        keep_emojis=False,
+        remove_html_markup=True,
+        replace_urls_emails_mentions=True,
+    ):
+        if "UBC-NLP" in model_name:
+            keep_emojis = True
+        super().__init__(
+            model_name, keep_emojis, remove_html_markup, replace_urls_emails_mentions
+        )
+        self.true_model_name = model_name
+    def preprocess(self, text):
+        if "UBC-NLP" in self.true_model_name:
+            return self.ubc_prep(text)
+    def ubc_prep(self, text):
+        text = re.sub("\s", " ", text)
+        text = text.replace("\\n", " ")
+        text = araby.strip_tashkeel(text)
+        text = araby.strip_tatweel(text)
+        # replace all possible URLs
+        for reg in url_regexes:
+            text = re.sub(reg, " URL ", text)
+        # replace mentions with USER
+        text = re.sub(user_mention_regex, " USER ", text)
+        # replace hashtags with HASHTAG
+        # text = re.sub(r"#[\w\d]+", " HASH TAG ", text)
+        text = text.replace("#", " HASH TAG ")
+        text = text.replace("_", " ")
+        text = " ".join(text.split())
+        text = super(Trial5ArabicPreprocessor, self).preprocess(text)
+        # text = text.replace("السلام عليكم"," ")
+        # text = text.replace(find_near_matches("السلام عليكم",text,max_deletions=3,max_l_dist=3)[0].matched," ")
+        return text
+"""SarcasmArabicPreprocessor"""
+class SarcasmArabicPreprocessor(ArabertPreprocessor):
+    def __init__(
+        self,
+        model_name,
+        keep_emojis=False,
+        remove_html_markup=True,
+        replace_urls_emails_mentions=True,
+    ):
+        if "UBC-NLP" in model_name:
+            keep_emojis = True
+        super().__init__(
+            model_name, keep_emojis, remove_html_markup, replace_urls_emails_mentions
+        )
+        self.true_model_name = model_name
+    def preprocess(self, text):
+        if "UBC-NLP" in self.true_model_name:
+            return self.ubc_prep(text)
+        else:
+            return super(SarcasmArabicPreprocessor, self).preprocess(text)
+    def ubc_prep(self, text):
+        text = re.sub("\s", " ", text)
+        text = text.replace("\\n", " ")
+        text = araby.strip_tashkeel(text)
+        text = araby.strip_tatweel(text)
+        # replace all possible URLs
+        for reg in url_regexes:
+            text = re.sub(reg, " URL ", text)
+        # replace mentions with USER
+        text = re.sub(user_mention_regex, " USER ", text)
+        # replace hashtags with HASHTAG
+        # text = re.sub(r"#[\w\d]+", " HASH TAG ", text)
+        text = text.replace("#", " HASH TAG ")
+        text = text.replace("_", " ")
+        text = text.replace('"', " ")
+        text = " ".join(text.split())
+        text = super(SarcasmArabicPreprocessor, self).preprocess(text)
+        return text
+"""NoAOAArabicPreprocessor"""
+class NoAOAArabicPreprocessor(ArabertPreprocessor):
+    def __init__(
+        self,
+        model_name,
+        keep_emojis=False,
+        remove_html_markup=True,
+        replace_urls_emails_mentions=True,
+    ):
+        if "UBC-NLP" in model_name:
+            keep_emojis = True
+        super().__init__(
+            model_name, keep_emojis, remove_html_markup, replace_urls_emails_mentions
+        )
+        self.true_model_name = model_name
+    def preprocess(self, text):
+        if "UBC-NLP" in self.true_model_name:
+            return self.ubc_prep(text)
+        else:
+            return super(NoAOAArabicPreprocessor, self).preprocess(text)
+    def ubc_prep(self, text):
+        text = re.sub("\s", " ", text)
+        text = text.replace("\\n", " ")
+        text = araby.strip_tashkeel(text)
+        text = araby.strip_tatweel(text)
+        # replace all possible URLs
+        for reg in url_regexes:
+            text = re.sub(reg, " URL ", text)
+        # replace mentions with USER
+        text = re.sub(user_mention_regex, " USER ", text)
+        # replace hashtags with HASHTAG
+        # text = re.sub(r"#[\w\d]+", " HASH TAG ", text)
+        text = text.replace("#", " HASH TAG ")
+        text = text.replace("_", " ")
+        text = " ".join(text.split())
+        text = super(NoAOAArabicPreprocessor, self).preprocess(text)
+        text = text.replace("السلام عليكم", " ")
+        text = text.replace("ورحمة الله وبركاته", " ")
+        matched = find_near_matches("السلام عليكم", text, max_deletions=3, max_l_dist=3)
+        if len(matched) > 0:
+            text = text.replace(matched[0].matched, " ")
+        matched = find_near_matches(
+            "ورحمة الله وبركاته", text, max_deletions=3, max_l_dist=3
+        )
+        if len(matched) > 0:
+            text = text.replace(matched[0].matched, " ")
+        return text
+class CnnBertForSequenceClassification(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+        self.bert = BertModel(config)
+        filter_sizes = [1, 2, 3, 4, 5]
+        num_filters = 32
+        self.convs1 = nn.ModuleList(
+            [nn.Conv2d(4, num_filters, (K, config.hidden_size)) for K in filter_sizes]
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(len(filter_sizes) * num_filters, config.num_labels)
+        self.init_weights()
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        x = outputs[2][-4:]
+        x = torch.stack(x, dim=1)
+        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]
+        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
+        x = torch.cat(x, 1)
+        x = self.dropout(x)
+        logits = self.classifier(x)
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (
+                    labels.dtype == torch.long or labels.dtype == torch.int
+                ):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            if self.config.problem_type == "regression":
+                loss_fct = nn.MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = nn.CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = nn.BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=None,
+            attentions=outputs.attentions,
+        )
+class CNNTextClassificationPipeline:
+    def __init__(self, model_path, device, return_all_scores=False):
+        self.model_path = model_path
+        self.model = CnnBertForSequenceClassification.from_pretrained(self.model_path)
+        # Special handling
+        self.device = torch.device("cpu" if device < 0 else f"cuda:{device}")
+        if self.device.type == "cuda":
+            self.model = self.model.to(self.device)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+        self.return_all_scores = return_all_scores
+    @contextmanager
+    def device_placement(self):
+        """
+        Context Manager allowing tensor allocation on the user-specified device in framework agnostic way.
+        Returns:
+            Context manager
+        Examples::
+            # Explicitly ask for tensor allocation on CUDA device :0
+            pipe = pipeline(..., device=0)
+            with pipe.device_placement():
+                # Every framework specific tensor allocation will be done on the request device
+                output = pipe(...)
+        """
+        if self.device.type == "cuda":
+            torch.cuda.set_device(self.device)
+        yield
+    def ensure_tensor_on_device(self, **inputs):
+        """
+        Ensure PyTorch tensors are on the specified device.
+        Args:
+            inputs (keyword arguments that should be :obj:`torch.Tensor`): The tensors to place on :obj:`self.device`.
+        Return:
+            :obj:`Dict[str, torch.Tensor]`: The same as :obj:`inputs` but on the proper device.
+        """
+        return {
+            name: tensor.to(self.device) if isinstance(tensor, torch.Tensor) else tensor
+            for name, tensor in inputs.items()
+        }
+    def __call__(self, text):
+        """
+        Classify the text(s) given as inputs.
+        Args:
+            args (:obj:`str` or :obj:`List[str]`):
+                One or several texts (or one list of prompts) to classify.
+        Return:
+            A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys:
+            - **label** (:obj:`str`) -- The label predicted.
+            - **score** (:obj:`float`) -- The corresponding probability.
+            If ``self.return_all_scores=True``, one such dictionary is returned per label.
+        """
+        # outputs = super().__call__(*args, **kwargs)
+        inputs = self.tokenizer.batch_encode_plus(
+            text,
+            add_special_tokens=True,
+            max_length=64,
+            padding=True,
+            truncation="longest_first",
+            return_tensors="pt",
+        )
+        with torch.no_grad():
+            inputs = self.ensure_tensor_on_device(**inputs)
+            predictions = self.model(**inputs)[0].cpu()
+        predictions = predictions.numpy()
+        if self.model.config.num_labels == 1:
+            scores = 1.0 / (1.0 + np.exp(-predictions))
+        else:
+            scores = np.exp(predictions) / np.exp(predictions).sum(-1, keepdims=True)
+        if self.return_all_scores:
+            return [
+                [
+                    {"label": self.model.config.id2label[i], "score": score.item()}
+                    for i, score in enumerate(item)
+                ]
+                for item in scores
+            ]
+        else:
+            return [
+                {"label": self.inv_label_map[item.argmax()], "score": item.max().item()}
+                for item in scores
+            ]

backend/services.py CHANGED Viewed

@@ -1,9 +1,17 @@
 import json
 import os
 import requests
 from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, set_seed
 from .modeling_gpt2 import GPT2LMHeadModel as GROVERLMHeadModel
 from .preprocess import ArabertPreprocessor
 # Taken and Modified from https://huggingface.co/spaces/flax-community/chef-transformer/blob/main/app.py
 class TextGeneration:
@@ -170,3 +178,172 @@ class TextGeneration:
             },
         }
         return self.query(payload, model_name)

 import json
 import os
+from typing import List
+import more_itertools
+import pandas as pd
 import requests
+from tqdm.auto import tqdm
 from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, set_seed
 from .modeling_gpt2 import GPT2LMHeadModel as GROVERLMHeadModel
 from .preprocess import ArabertPreprocessor
+from .sa_utils import *
+from .utils import download_models
 # Taken and Modified from https://huggingface.co/spaces/flax-community/chef-transformer/blob/main/app.py
 class TextGeneration:
             },
         }
         return self.query(payload, model_name)
+class SentimentAnalyzer:
+    def __init__(self):
+        self.sa_models = [
+            "sa_trial5_1",
+            "sa_no_aoa_in_neutral",
+            "sa_cnnbert",
+            "sa_sarcasm",
+            "sar_trial10",
+            "sa_no_AOA",
+        ]
+        self.model_repos = download_models(self.sa_models)
+        # fmt: off
+        self.processors = {
+            "sa_trial5_1": Trial5ArabicPreprocessor(model_name='UBC-NLP/MARBERT'),
+            "sa_no_aoa_in_neutral": NewArabicPreprocessorBalanced(model_name='UBC-NLP/MARBERT'),
+            "sa_cnnbert": CNNMarbertArabicPreprocessor(model_name='UBC-NLP/MARBERT'),
+            "sa_sarcasm": SarcasmArabicPreprocessor(model_name='UBC-NLP/MARBERT'),
+            "sar_trial10": SarcasmArabicPreprocessor(model_name='UBC-NLP/MARBERT'),
+            "sa_no_AOA": NewArabicPreprocessorBalanced(model_name='UBC-NLP/MARBERT'),
+        }
+        self.pipelines = {
+            "sa_trial5_1": [pipeline("sentiment-analysis", model="{}/train_{}/best_model".format(self.model_repos["sa_trial5_1"],i), device=-1,return_all_scores =True) for i in range(0,5)],
+            "sa_no_aoa_in_neutral": [pipeline("sentiment-analysis", model="{}/train_{}/best_model".format(self.model_repos["sa_no_aoa_in_neutral"],i), device=-1,return_all_scores =True) for i in range(0,5)],
+            "sa_cnnbert": [CNNTextClassificationPipeline("{}/train_{}/best_model".format(self.model_repos["sa_cnnbert"],i), device=-1, return_all_scores =True) for i in range(0,5)],
+            "sa_sarcasm": [pipeline("sentiment-analysis", model="{}/train_{}/best_model".format(self.model_repos["sa_sarcasm"],i), device=-1,return_all_scores =True) for i in range(0,5)],
+            "sar_trial10": [pipeline("sentiment-analysis", model="{}/train_{}/best_model".format(self.model_repos["sar_trial10"],i), device=-1,return_all_scores =True) for i in range(0,5)],
+            "sa_no_AOA": [pipeline("sentiment-analysis", model="{}/train_{}/best_model".format(self.model_repos["sa_no_aoa_in_neutral"],i), device=-1,return_all_scores =True) for i in range(0,5)],
+        }
+        # fmt: on
+    def get_sarcasm_label(self, texts):
+        prep = self.processors["sar_trial10"]
+        prep_texts = [prep.preprocess(x) for x in texts]
+        preds_df = pd.DataFrame([])
+        for i in range(0, 5):
+            preds = []
+            for s in tqdm(more_itertools.chunked(list(prep_texts), 128)):
+                preds.extend(self.pipelines["sar_trial10"][i](s))
+            preds_df[f"model_{i}"] = preds
+        final_labels = []
+        final_scores = []
+        for id, row in preds_df.iterrows():
+            pos_total = 0
+            neu_total = 0
+            for pred in row[:]:
+                pos_total += pred[0]["score"]
+                neu_total += pred[1]["score"]
+            pos_avg = pos_total / len(row[:])
+            neu_avg = neu_total / len(row[:])
+            final_labels.append(
+                self.pipelines["sar_trial10"][0].model.config.id2label[
+                    np.argmax([pos_avg, neu_avg])
+                ]
+            )
+            final_scores.append(np.max([pos_avg, neu_avg]))
+        return final_labels, final_scores
+    def get_preds_from_a_model(self, texts: List[str], model_name):
+        prep = self.processors[model_name]
+        prep_texts = [prep.preprocess(x) for x in texts]
+        if model_name == "sa_sarcasm":
+            sarcasm_label, _ = self.get_preds_from_sarcasm(texts, "sar_trial10")
+            sarcastic_map = {"Not_Sarcastic": "غير ساخر", "Sarcastic": "ساخر"}
+            labeled_prep_texts = []
+            for t, l in zip(prep_texts, sarcasm_label):
+                labeled_prep_texts.append(sarcastic_map[l] + " [SEP] " + t)
+        preds_df = pd.DataFrame([])
+        for i in range(0, 5):
+            preds = []
+            for s in tqdm(more_itertools.chunked(list(prep_texts), 128)):
+                preds.extend(self.pipelines[model_name][i](s))
+            preds_df[f"model_{i}"] = preds
+        final_labels = []
+        final_scores = []
+        final_scores_list = []
+        for id, row in preds_df.iterrows():
+            pos_total = 0
+            neg_total = 0
+            neu_total = 0
+            for pred in row[2:]:
+                pos_total += pred[0]["score"]
+                neu_total += pred[1]["score"]
+                neg_total += pred[2]["score"]
+            pos_avg = pos_total / 5
+            neu_avg = neu_total / 5
+            neg_avg = neg_total / 5
+            if model_name == "sa_no_aoa_in_neutral":
+                final_labels.append(
+                    self.pipelines[model_name][0].model.config.id2label[
+                        np.argmax([neu_avg, neg_avg, pos_avg])
+                    ]
+                )
+            else:
+                final_labels.append(
+                    self.pipelines[model_name][0].model.config.id2label[
+                        np.argmax([pos_avg, neu_avg, neg_avg])
+                    ]
+                )
+            final_scores.append(np.max([pos_avg, neu_avg, neg_avg]))
+            final_scores_list.append((pos_avg, neu_avg, neg_avg))
+        return final_labels, final_scores, final_scores_list
+    def predict(self, texts: List[str]):
+        (
+            new_balanced_label,
+            new_balanced_score,
+            new_balanced_score_list,
+        ) = self.get_preds_from_a_model(texts, "sa_no_aoa_in_neutral")
+        (
+            cnn_marbert_label,
+            cnn_marbert_score,
+            cnn_marbert_score_list,
+        ) = self.get_preds_from_a_model(texts, "sa_cnnbert")
+        trial5_label, trial5_score, trial5_score_list = self.get_preds_from_a_model(
+            texts, "sa_trial5_1"
+        )
+        no_aoa_label, no_aoa_score, no_aoa_score_list = self.get_preds_from_a_model(
+            texts, "sa_no_AOA"
+        )
+        sarcasm_label, sarcasm_score, sarcasm_score_list = self.get_preds_from_a_model(
+            texts, "sa_sarcasm"
+        )
+        id_label_map = {0: "Positive", 1: "Neutral", 2: "Negative"}
+        final_ensemble_prediction = []
+        final_ensemble_score = []
+        final_ensemble_all_score = []
+        for entry in zip(
+            new_balanced_score_list,
+            cnn_marbert_score_list,
+            trial5_score_list,
+            no_aoa_score_list,
+            sarcasm_score_list,
+        ):
+            pos_score = 0
+            neu_score = 0
+            neg_score = 0
+            for s in entry:
+                pos_score += s[0] * 1.57
+                neu_score += s[1] * 0.98
+                neg_score += s[2] * 0.93
+                # weighted 2
+                # pos_score += s[0]*1.67
+                # neu_score += s[1]
+                # neg_score += s[2]*0.95
+            final_ensemble_prediction.append(
+                id_label_map[np.argmax([pos_score, neu_score, neg_score])]
+            )
+            final_ensemble_score.append(np.max([pos_score, neu_score, neg_score]))
+            final_ensemble_all_score.append((pos_score, neu_score, neg_score))
+        return final_ensemble_prediction, final_ensemble_score, final_ensemble_all_score

backend/utils.py CHANGED Viewed

@@ -1,6 +1,16 @@
 import psutil
 def get_current_ram_usage():
     ram = psutil.virtual_memory()
     return ram.available / 1024 / 1024 / 1024, ram.total / 1024 / 1024 / 1024

 import psutil
+from huggingface_hub import Repository
 def get_current_ram_usage():
     ram = psutil.virtual_memory()
     return ram.available / 1024 / 1024 / 1024, ram.total / 1024 / 1024 / 1024
+def download_models(models):
+    model_dirs = {}
+    for model in models:
+        model_dirs[model] = Repository(
+            model, clone_from=f"https://huggingface.co/researchaccount/{model}"
+        )
+    return model_dirs

requirements.txt CHANGED Viewed

@@ -7,4 +7,6 @@ emoji==1.4.2
 awesome_streamlit
 torch==1.9.0
 transformers==4.10.0
-psutil==5.8.0

 awesome_streamlit
 torch==1.9.0
 transformers==4.10.0
+psutil==5.8.0
+fuzzysearch==0.7.3
+more-itertools==8.9.0