Aman72321 committed on
Commit
3488e32
1 Parent(s): b935e08

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -0
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import gc
3
+ from collections import defaultdict
4
+ import torch
5
+ from transformers import pipeline
6
+ from lingua import Language, LanguageDetectorBuilder
7
+
8
# Multilingual checkpoint used for every language that has no dedicated model below.
_MULTILINGUAL_MODEL = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"

# HuggingFace model path to use for each supported lingua Language.
default_models = {
    Language.ENGLISH: _MULTILINGUAL_MODEL,
    Language.JAPANESE: _MULTILINGUAL_MODEL,
    Language.ARABIC: "Ammar-alhaj-ali/arabic-MARBERT-sentiment",
    Language.GERMAN: _MULTILINGUAL_MODEL,
    Language.SPANISH: _MULTILINGUAL_MODEL,
    Language.FRENCH: _MULTILINGUAL_MODEL,
    Language.CHINESE: _MULTILINGUAL_MODEL,
    Language.INDONESIAN: _MULTILINGUAL_MODEL,
    Language.HINDI: _MULTILINGUAL_MODEL,
    Language.ITALIAN: _MULTILINGUAL_MODEL,
    Language.MALAY: _MULTILINGUAL_MODEL,
    Language.PORTUGUESE: _MULTILINGUAL_MODEL,
    Language.SWEDISH: "KBLab/robust-swedish-sentiment-multiclass",
    Language.FINNISH: "fergusq/finbert-finnsentiment",
}

# Detector built once at import time over every language lingua supports.
_detector_builder = LanguageDetectorBuilder.from_all_languages()
language_detector = _detector_builder.build()
25
+
26
+
27
def split_message(message, max_length):
    """
    Cut a message into consecutive chunks of at most max_length items.

    Params:
        message: sliceable sequence (typically a string) to split
        max_length: maximum size of each chunk
    Returns:
        list of chunks in their original order; empty when message is empty
    """
    pieces = []
    for start in range(0, len(message), max_length):
        stop = start + max_length
        pieces.append(message[start:stop])
    return pieces
30
+
31
+
32
def process_messages_in_batches(messages_with_languages, models=None, max_length=512):
    """
    Run sentiment analysis on messages, creating only one pipeline at a time,
    and return the results in the original message order.

    Messages are grouped by the model assigned to their language so that each
    model is instantiated once. Every message is cut into chunks of at most
    max_length characters; chunk scores are summed per (lower-cased) label and
    the label with the largest total wins, reported with its total divided by
    the number of chunks of that message.

    Params:
        messages_with_languages: iterable of tuples, each containing a message
            and its detected language
        models: optional dict of model paths indexed by Language; its entries
            override the ones in default_models
        max_length: maximum chunk size, in characters
    Returns:
        list of dicts {"label": str, "score": number}, one per input message
        and in input order. Messages that are empty, whose language was not
        detected, or whose language has no model get
        {"label": "none", "score": 0}.
    """
    # dict.update() returns None, so the overrides are merged with dict
    # unpacking instead; default_models itself is never mutated.
    models = {**default_models, **(models or {})}

    # Group messages by model, remembering each message's original index.
    # Every message starts with the "none" placeholder, which is kept when the
    # language is not detected, no model is provided for it, or the message is
    # empty (an empty message produces no chunks to classify).
    results = []
    messages_by_model = defaultdict(list)
    for index, (message, language) in enumerate(messages_with_languages):
        results.append({"label": "none", "score": 0})
        model_name = models.get(language)
        if model_name and message:
            messages_by_model[model_name].append((index, message))

    for model_name, batch in messages_by_model.items():
        # Flatten all chunks of the batch into one list and remember which
        # positions in that list belong to which original message.
        chunks = []
        chunk_positions = defaultdict(list)
        for idx, message in batch:
            for chunk in split_message(message, max_length):
                chunk_positions[idx].append(len(chunks))
                chunks.append(chunk)

        if not chunks:
            # Nothing to classify; avoid loading the model at all.
            continue

        sentiment_pipeline = pipeline(model=model_name)
        # Chunks are measured in characters while the model limit is counted
        # in tokens, so a chunk can still be too long once tokenized;
        # truncation=True asks the tokenizer to cut it to the model maximum.
        chunk_sentiments = sentiment_pipeline(chunks, truncation=True)

        # Force garbage collection to remove the model from memory before the
        # next one is loaded.
        del sentiment_pipeline
        gc.collect()

        for idx, positions in chunk_positions.items():
            sum_scores = defaultdict(float)
            for position in positions:
                prediction = chunk_sentiments[position]
                # Lower-case here to unify common spellings of the labels.
                sum_scores[prediction["label"].lower()] += prediction["score"]
            best_sentiment = max(sum_scores, key=sum_scores.get)
            results[idx] = {
                "label": best_sentiment,
                "score": sum_scores[best_sentiment] / len(positions),
            }

    return results
101
+
102
+
103
def sentiment(messages, models=None):
    """
    Estimate the sentiment of each message in a list of strings.

    The language of every message is detected independently, so the
    messages may be in different languages from each other. A set of
    default models is maintained for some languages; in addition, the
    caller can supply a model for a given language through the models
    dictionary, whose keys are lingua.Language objects and whose values
    are HuggingFace model paths.

    Params:
        messages: list of message strings
        models: dict, huggingface model paths indexed by lingua.Language
    Returns:
        the output of process_messages_in_batches: a list with one
        {"label": ..., "score": ...} dict per message, in input order
    """
    tagged_messages = []
    for text in messages:
        detected = language_detector.detect_language_of(text)
        tagged_messages.append((text, detected))

    return process_messages_in_batches(tagged_messages, models)
123
+
124
+
125
def main(input_text):
    """
    Analyse the sentiment of every non-blank line of the input text.

    Params:
        input_text: string holding one message per line
    Returns:
        list of (message, sentiment label, sentiment score) tuples, one per
        non-blank line, with surrounding whitespace stripped from the message
    """
    messages = []
    for line in input_text.split('\n'):
        stripped = line.strip()
        if stripped:
            messages.append(stripped)

    results = sentiment(messages)
    return [
        (messages[position], outcome["label"], outcome["score"])
        for position, outcome in enumerate(results)
    ]
135
+
136
+
137
# Table with one row per analysed message.
# NOTE(review): the gr.outputs namespace may be deprecated or removed in newer
# Gradio releases — confirm against the Gradio version this app is pinned to.
results_table = gr.outputs.Table(headings=["Message", "Sentiment", "Score"])

# Web UI: a single text box in, the results table out, driven by main().
iface = gr.Interface(
    fn=main,
    inputs="text",
    outputs=[results_table],
    title="Sentiment Analysis Pipeline",
    description="Enter your messages (one per line) and get sentiment analysis results.",
)

# Start the app as soon as the module is executed.
iface.launch()