Aman72321 commited on
Commit
58937e8
1 Parent(s): 579ae41

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +172 -0
app.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sentiment analysis pipeline for texts in multiple languages.
3
+ """
4
+
5
+ import gc
6
+ from collections import defaultdict
7
+ import lingua
8
+ from transformers import pipeline
9
+ import torch
10
+ from lingua import Language, LanguageDetectorBuilder
11
+
12
+
13
+ __version__ = "0.1.0"
14
+
15
+ if torch.cuda.is_available():
16
+ device_tag = 0 # first gpu
17
+ else:
18
+ device_tag = -1 # cpu
19
+
20
+
21
# Multilingual checkpoint shared by every language below that has no
# dedicated model of its own.
_MULTILINGUAL_MODEL = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"

# Default Hugging Face model path for each supported lingua Language.
default_models = {
    Language.ENGLISH: _MULTILINGUAL_MODEL,
    Language.JAPANESE: _MULTILINGUAL_MODEL,
    Language.ARABIC: "Ammar-alhaj-ali/arabic-MARBERT-sentiment",
    Language.GERMAN: _MULTILINGUAL_MODEL,
    Language.SPANISH: _MULTILINGUAL_MODEL,
    Language.FRENCH: _MULTILINGUAL_MODEL,
    Language.CHINESE: _MULTILINGUAL_MODEL,
    Language.INDONESIAN: _MULTILINGUAL_MODEL,
    Language.HINDI: _MULTILINGUAL_MODEL,
    Language.ITALIAN: _MULTILINGUAL_MODEL,
    Language.MALAY: _MULTILINGUAL_MODEL,
    Language.PORTUGUESE: _MULTILINGUAL_MODEL,
    Language.SWEDISH: "KBLab/robust-swedish-sentiment-multiclass",
    Language.FINNISH: "fergusq/finbert-finnsentiment",
}

# Language detector covering all languages lingua knows about; built once at
# import time and shared by every call.
language_detector = LanguageDetectorBuilder.from_all_languages().build()
38
+
39
+
40
+
41
+ # Processing a batch:
42
+ # Detect languages into a list and map to models
43
+ # For each model, make a pipeline, make a list and process
44
+ # inject the results into a list in the original order
45
+
46
def split_message(message, max_length):
    """
    Split a message into consecutive chunks of at most max_length characters.

    Params:
        message: string to split
        max_length: maximum number of characters per chunk, must be positive

    Returns:
        list of str: the chunks in their original order. Joining them gives
        back the message. An empty message gives an empty list.

    Raises:
        ValueError: if max_length is zero or negative
    """
    # Without this check a zero step fails inside range() with an unrelated
    # message, and a negative step silently produces no chunks at all.
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    return [message[i:i + max_length] for i in range(0, len(message), max_length)]
49
+
50
+
51
def _summarise_chunk_sentiments(chunk_sentiments):
    """Combine the per-chunk pipeline outputs of one message into one result."""
    # "neutral" is seeded with 0 so the label is always a candidate.
    sum_scores = {"neutral": 0}
    for chunk_sentiment in chunk_sentiments:
        label = chunk_sentiment["label"]
        sum_scores[label] = sum_scores.get(label, 0) + chunk_sentiment["score"]
    best_sentiment = max(sum_scores, key=sum_scores.get)
    # The winning label's summed score is averaged over all chunks of the
    # message, not only over the chunks that voted for that label.
    score = sum_scores[best_sentiment] / len(chunk_sentiments)
    # Lower-case to unify the label spellings used by different models.
    return {"label": best_sentiment.lower(), "score": score}


def process_messages_in_batches(
    messages_with_languages,
    models=None,
    max_length=512,
):
    """
    Process messages in batches, creating only one pipeline at a time, and
    maintain the original order.

    Messages are grouped by the model mapped to their language. Each message
    is split into chunks of at most max_length characters, every chunk is
    classified, and the chunk results are combined into one label and score
    per message.

    Params:
        messages_with_languages: iterable of tuples, each containing a message
            and its detected language
        models: dict, model paths indexed by Language. Entries override or
            extend default_models; None uses default_models as is.
        max_length: maximum number of characters per chunk

    Returns:
        list of dict: one {"label": ..., "score": ...} entry per input
        message, in input order. Labels are lower-cased. A message that is
        empty, or whose language has no model, gets
        {"label": "none", "score": 0}.
    """
    if models is None:
        models = default_models
    else:
        # dict.update() returns None, so the merge has to build a new dict.
        # This also leaves default_models itself untouched.
        models = {**default_models, **models}

    results = {}

    # Group messages by model, remembering each message's original index.
    # If the language is not detected, no model is provided for it, or the
    # message is empty (nothing to classify), record a "none" result.
    messages_by_model = defaultdict(list)
    for index, (message, language) in enumerate(messages_with_languages):
        model_name = models.get(language)
        if model_name and message:
            messages_by_model[model_name].append((index, message))
        else:
            results[index] = {"label": "none", "score": 0}

    for model_name, batch in messages_by_model.items():
        sentiment_pipeline = pipeline(model=model_name, device=device_tag)

        # Flatten all chunks of this batch into one list and remember which
        # slice of that list belongs to which message.
        chunks = []
        chunk_spans = {}
        for idx, message in batch:
            message_chunks = split_message(message, max_length)
            chunk_spans[idx] = (len(chunks), len(chunks) + len(message_chunks))
            chunks.extend(message_chunks)

        # max_length counts characters, not tokens, so a chunk can still
        # tokenise to more than the model accepts; truncation=True lets the
        # tokenizer cut such a chunk instead of the model raising.
        chunk_sentiments = sentiment_pipeline(chunks, truncation=True)

        for idx, (start, stop) in chunk_spans.items():
            results[idx] = _summarise_chunk_sentiments(chunk_sentiments[start:stop])

        # Force garbage collection to remove the model from memory before
        # the next pipeline is created.
        del sentiment_pipeline
        gc.collect()

    # Every index now has an entry; emit them in the original order.
    return [results[i] for i in range(len(results))]
126
+
127
+
128
def sentiment(messages, models=None):
    """
    Estimate the sentiment of a list of messages (strings of text). The
    messages may be in different languages from each other.

    The language of every message is detected first; the message/language
    pairs are then handed to process_messages_in_batches.

    We maintain a list of default models for some languages. In addition,
    the user can provide a model for a given language in the models
    dictionary. The keys for this dictionary are lingua.Language objects
    and the values are HuggingFace model paths.

    Params:
        messages: list of message strings
        models: dict, huggingface model paths indexed by lingua.Language

    Returns:
        list of dict: one entry per message, in the same order, each with a
        "label" and a "score" key
    """
    detected = []
    for text in messages:
        detected.append((text, language_detector.detect_language_of(text)))

    return process_messages_in_batches(detected, models)
151
+
152
+
153
# Sample messages in several languages used to demonstrate the pipeline.
messages = [
    "I love this product! It's amazing!",
    "This movie was terrible. I regret watching it.",
    "今日はいい天気ですね。",
    "Je suis très content de votre service.",
    "Este restaurante tiene una comida deliciosa.",
    "خدمة المطعم كانت محبطة",
    "أنا سعيد"
    # Add more messages as needed
]
results = sentiment(messages)

# Print every message next to its predicted sentiment.
for message, result in zip(messages, results):
    sentiment_label, sentiment_score = result["label"], result["score"]
    print(f"Message: {message}")
    print(f"Sentiment: {sentiment_label} (Score: {sentiment_score})")
    print()