Aman72321 committed on
Commit 438cb84
Parent(s): 6ee22d1

Creates app.py

Files changed (1)
1. app.py +149 -0
app.py ADDED
@@ -0,0 +1,149 @@
import streamlit as st
import gc
from collections import defaultdict
import torch
from transformers import pipeline
from lingua import Language, LanguageDetectorBuilder

__version__ = "0.1.0"

if torch.cuda.is_available():
    device_tag = 0   # first gpu
else:
    device_tag = -1  # cpu

default_models = {
    Language.ENGLISH: "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    Language.JAPANESE: "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    Language.ARABIC: "Ammar-alhaj-ali/arabic-MARBERT-sentiment",
    Language.GERMAN: "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    Language.SPANISH: "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    Language.FRENCH: "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    Language.CHINESE: "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    Language.INDONESIAN: "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    Language.HINDI: "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    Language.ITALIAN: "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    Language.MALAY: "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    Language.PORTUGUESE: "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    Language.SWEDISH: "KBLab/robust-swedish-sentiment-multiclass",
    Language.FINNISH: "fergusq/finbert-finnsentiment",
}
language_detector = LanguageDetectorBuilder.from_all_languages().build()


def split_message(message, max_length):
    """Split a message into a list of chunks of given maximum size."""
    return [message[i: i + max_length] for i in range(0, len(message), max_length)]
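# For example, split_message("abcdefgh", 3) returns ["abc", "def", "gh"]. The split
# is by character count rather than tokens, so max_length is presumably used as a
# rough proxy for the model's input limit.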


def process_messages_in_batches(messages_with_languages, models=None, max_length=512):
    """
    Process messages in batches, creating only one pipeline at a time, while
    maintaining the original order.

    Params:
        messages_with_languages: list of tuples, each containing a message and its detected language
        models: dict, Hugging Face model paths indexed by lingua.Language
        max_length: int, maximum chunk length (in characters) passed to the model

    Returns:
        list: one sentiment result dict ({"label": ..., "score": ...}) per message,
        in the original message order
    """
    if models is None:
        models = default_models
    else:
        # dict.update() returns None, so merge into a copy instead of chaining the calls
        merged = default_models.copy()
        merged.update(models)
        models = merged

    results = {}

    # Group messages by model, preserving the original order.
    # If the language is not detected, or no model is configured for it,
    # store a placeholder result instead.
    messages_by_model = defaultdict(list)
    for index, (message, language) in enumerate(messages_with_languages):
        model_name = models.get(language)
        if model_name:
            messages_by_model[model_name].append((index, message))
        else:
            results[index] = {"label": "none", "score": 0}

    # Process messages one model at a time, keeping track of the original indices
    for model_name, batch in messages_by_model.items():
        sentiment_pipeline = pipeline(model=model_name, device=device_tag)

        # Split each message into chunks and remember which chunks belong to which message
        chunks = []
        message_map = {}
        for idx, message in batch:
            message_chunks = split_message(message, max_length)
            for chunk in message_chunks:
                chunks.append(chunk)
                if idx in message_map:
                    message_map[idx].append(len(chunks) - 1)
                else:
                    message_map[idx] = [len(chunks) - 1]

        chunk_sentiments = sentiment_pipeline(chunks)

        # Aggregate chunk-level scores back into a single result per message
        for idx, chunk_indices in message_map.items():
            sum_scores = {"neutral": 0}
            for chunk_idx in chunk_indices:
                label = chunk_sentiments[chunk_idx]["label"]
                score = chunk_sentiments[chunk_idx]["score"]
                if label in sum_scores:
                    sum_scores[label] += score
                else:
                    sum_scores[label] = score
            best_sentiment = max(sum_scores, key=sum_scores.get)
            score = sum_scores[best_sentiment] / len(chunk_indices)
            results[idx] = {"label": best_sentiment, "score": score}

        # Force garbage collection to remove the model from memory
        del sentiment_pipeline
        gc.collect()

    # Unify common spellings of the labels
    for i in range(len(results)):
        results[i]["label"] = results[i]["label"].lower()

    # Convert to a list ordered by the original message index
    results = [results[i] for i in range(len(results))]

    return results
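# A hypothetical return value of process_messages_in_batches for two messages, where
# the second is in a language without a configured model (scores are illustrative):
#   [{"label": "positive", "score": 0.93}, {"label": "none", "score": 0}]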


def sentiment(messages, models=None):
    """
    Estimate the sentiment of a list of messages (strings of text). The
    messages may be in different languages from each other.

    We maintain a list of default models for some languages. In addition,
    the user can provide a model for a given language in the models
    dictionary. The keys of this dictionary are lingua.Language objects
    and the values are Hugging Face model paths.

    Params:
        messages: list of message strings
        models: dict, Hugging Face model paths indexed by lingua.Language

    Returns:
        list: one sentiment result dict ({"label": ..., "score": ...}) per message,
        in the same order as the input messages
    """
    messages_with_languages = [
        (message, language_detector.detect_language_of(message)) for message in messages
    ]

    results = process_messages_in_batches(messages_with_languages, models)
    return results
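# Example call (a sketch: the messages and scores are illustrative, and the override
# below simply reuses the Swedish model path already listed in default_models):
#
#   results = sentiment(
#       ["I love this product!", "Det här är hemskt."],
#       models={Language.SWEDISH: "KBLab/robust-swedish-sentiment-multiclass"},
#   )
#   # e.g. [{"label": "positive", "score": 0.97}, {"label": "negative", "score": 0.91}]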


def main():
    st.title("Sentiment Analysis Pipeline")
    messages_input = st.text_area("Enter your messages (one per line):", height=200)
    messages = [message.strip() for message in messages_input.split('\n') if message.strip()]

    if st.button("Analyze Sentiments"):
        results = sentiment(messages)
        st.write("## Results:")
        for idx, result in enumerate(results):
            message = messages[idx]
            sentiment_label = result["label"]
            sentiment_score = result["score"]
            st.write(f"**Message:** {message}")
            st.write(f"**Sentiment:** {sentiment_label.capitalize()} (Score: {sentiment_score:.2f})")


if __name__ == "__main__":
    main()
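# To run the app locally (assuming Streamlit and the other dependencies are installed):
#
#   streamlit run app.py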