Lsthf vdmbrsv committed on
Commit
fc3a84d
·
0 Parent(s):

Duplicate from tabularisai/multilingual-emotion-classification

Browse files

Co-authored-by: Vadim Borisov <vdmbrsv@users.noreply.huggingface.co>

Files changed (6) hide show
  1. .gitattributes +36 -0
  2. README.md +237 -0
  3. config.json +57 -0
  4. model.safetensors +3 -0
  5. tokenizer.json +3 -0
  6. tokenizer_config.json +14 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: FacebookAI/xlm-roberta-base
3
+ language:
4
+ - en
5
+ - zh
6
+ - es
7
+ - hi
8
+ - ar
9
+ - bn
10
+ - pt
11
+ - ru
12
+ - ja
13
+ - de
14
+ - id
15
+ - ta
16
+ - vi
17
+ - ko
18
+ - fr
19
+ - tr
20
+ - it
21
+ - pl
22
+ - uk
23
+ - ur
24
+ - nl
25
+ - pa
26
+ - sw
27
+ library_name: transformers
28
+ license: cc-by-nc-4.0
29
+ pipeline_tag: text-classification
30
+ tags:
31
+ - text-classification
32
+ - emotion-classification
33
+ - emotion
34
+ - multi-label-classification
35
+ - synthetic data
36
+ - social-media-analysis
37
+ - customer-feedback
38
+ - product-reviews
39
+ - brand-monitoring
40
+ - multilingual
41
+ - 🇪🇺
42
+ - region:eu
43
+ - synthetic
44
+ ---
45
+
46
+ > [!TIP]
47
+ > If you wish to use this model for commercial purposes, please obtain a license by contacting: info@tabularis.ai
48
+
49
+
50
+ # 🎭 Multilingual Emotion Classification Model (23 Languages, 11 Emotions)
51
+
52
+ [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/Discord%20button.png" width="200"/>](https://discord.gg/sznxwdqBXj)
53
+
54
+
55
+ ## Model Details
56
+ - `Model Name:` tabularisai/multilingual-emotion-classification
57
+ - `Base Model:` FacebookAI/xlm-roberta-base
58
+ - `Task:` Multi-label Text Classification (Emotion Recognition)
59
+ - `Languages:` 23 — English, Mandarin Chinese (中文), Spanish (Español), Hindi (हिन्दी), Arabic (العربية), Bengali (বাংলা), Portuguese (Português), Russian (Русский), Japanese (日本語), German (Deutsch), Indonesian (Bahasa Indonesia), Tamil (தமிழ்), Vietnamese (Tiếng Việt), Korean (한국어), French (Français), Turkish (Türkçe), Italian (Italiano), Polish (Polski), Ukrainian (Українська), Urdu (اردو), Dutch (Nederlands), Punjabi (ਪੰਜਾਬੀ), and Swahili.
60
+ - `Number of Classes:` 11 — *anger, contempt, disgust, fear, frustration, gratitude, joy, love, neutral, sadness, surprise*
61
+ - `Label Mode:` Multi-label — each text can be assigned **zero, one, or multiple** emotions (independent sigmoid heads, τ = 0.5).
62
+ - `Usage:`
63
+ - Social media emotion analysis
64
+ - Customer feedback analysis
65
+ - Product review emotion tagging
66
+ - Brand monitoring
67
+ - Conversational AI / chatbot affect tracking
68
+ - Market research
69
+ - Customer service optimization
70
+
71
+
72
+
73
+
74
+ ## Model Description
75
+
76
+ This model is a fine-tuned version of `FacebookAI/xlm-roberta-base` for multilingual multi-label emotion classification. It was trained on synthetic multilingual data covering 23 languages and 11 emotion categories, enabling robust emotion detection across languages, registers, and cultural contexts.
77
+
78
+ Unlike single-label sentiment classifiers, this model predicts a **set** of emotions per input — reflecting the reality that utterances often carry mixed affect (e.g. *gratitude + love*, *frustration + sadness*).
79
+
80
+ ### Training Data
81
+
82
+ Trained on synthetic multilingual data generated by advanced LLMs, providing broad coverage of emotion expressions across all 23 supported languages. All labels are multi-hot vectors over the 11 emotion classes.
83
+
84
+ ### Training Procedure
85
+
86
+ - Fine-tuned for 3 epochs with **BCEWithLogitsLoss** (independent-binary-per-label).
87
+ - Cosine LR schedule with 6% warmup, `lr=2e-5`, effective batch size 64.
88
+ - Mixed precision (bf16) on a single A100; max sequence length 192.
89
+ - Per-epoch checkpointing; epoch 3 was selected by validation F1-micro.
90
+
91
+ ## Evaluation (held-out multilingual test set, 11,500 rows)
92
+
93
+ | Metric | Value |
94
+ |---------------------|-------|
95
+ | F1 (micro) | 0.840 |
96
+ | F1 (macro) | 0.839 |
97
+ | Jaccard (samples) | 0.794 |
98
+ | Subset accuracy | 0.640 |
99
+ | Hamming accuracy | 0.953 |
100
+ | AUROC (micro) | 0.980 |
101
+ | Average Precision (micro) | 0.923 |
102
+ | LRAP | 0.936 |
103
+
104
+ Decision threshold: τ = 0.5 applied independently per label.
105
+
106
+ ## Intended Use
107
+
108
+ Ideal for:
109
+ - Multilingual social media emotion monitoring
110
+ - International customer feedback affect analysis
111
+ - Global product review emotion tagging
112
+ - Worldwide brand sentiment & emotion tracking
113
+ - Affect-aware conversational systems
114
+
115
+ ## How to Use
116
+
117
+ ```python
118
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
119
+ import torch
120
+
121
+ model_name = "tabularisai/multilingual-emotion-classification"
122
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
123
+ model = AutoModelForSequenceClassification.from_pretrained(model_name)
124
+ model.eval()
125
+
126
+ LABELS = ["anger", "contempt", "disgust", "fear", "frustration",
127
+ "gratitude", "joy", "love", "neutral", "sadness", "surprise"]
128
+
129
+ @torch.no_grad()
130
+ def predict_emotions(texts, threshold: float = 0.5):
131
+ inputs = tokenizer(texts, return_tensors="pt", truncation=True,
132
+ padding=True, max_length=192)
133
+ probs = torch.sigmoid(model(**inputs).logits).cpu().numpy()
134
+ results = []
135
+ for row in probs:
136
+ picked = [(LABELS[i], float(row[i])) for i in range(len(LABELS)) if row[i] >= threshold]
137
+ picked.sort(key=lambda x: -x[1])
138
+ results.append(picked or [("neutral", float(row[LABELS.index("neutral")]))])
139
+ return results
140
+
141
+
142
+ texts = [
143
+ # English
144
+ "Thank you so much for helping me, I really appreciate it!",
145
+ "I can't believe they cancelled the flight again, this is ridiculous.",
146
+ # Spanish
147
+ "¡Qué alegría verte después de tanto tiempo!",
148
+ "Estoy muy decepcionado con el servicio.",
149
+ # Chinese
150
+ "收到你的礼物我真的很感动,谢谢你!",
151
+ "这部电影太吓人了,我都不敢一个人看。",
152
+ # Arabic
153
+ "أنا ممتن جدًا لكل ما فعلته من أجلي.",
154
+ "لا أستطيع تحمّل هذا الوضع أكثر من ذلك.",
155
+ # Hindi
156
+ "आपका यह तोहफ़ा देखकर मेरी आँखों में आँसू आ गए।",
157
+ "यह सेवा बिल्कुल घटिया थी, मैं बहुत निराश हूँ।",
158
+ # Japanese
159
+ "久しぶりに会えて本当に嬉しいです!",
160
+ "また電車が遅れた...本当にうんざりする。",
161
+ # French
162
+ "Je suis tellement reconnaissant pour tout ce que tu as fait.",
163
+ "C'est inadmissible, j'en ai assez de cette situation.",
164
+ # Swahili
165
+ "Asante sana kwa msaada wako, nakupenda sana!",
166
+ "Nimechoka kabisa na huduma hii mbaya.",
167
+ ]
168
+
169
+ for t, r in zip(texts, predict_emotions(texts)):
170
+ tags = ", ".join(f"{lbl}({p:.2f})" for lbl, p in r)
171
+ print(f"Text: {t}\nEmotions: {tags}\n")
172
+ ```
173
+
174
+ Using the `pipeline` API (returns a probability for each of the 11 classes):
175
+
176
+ ```python
177
+ from transformers import pipeline
178
+
179
+ pipe = pipeline(
180
+ "text-classification",
181
+ model="tabularisai/multilingual-emotion-classification",
182
+ function_to_apply="sigmoid",
183
+ top_k=None,
184
+ )
185
+
186
+ print(pipe("I love this product! It's amazing and works perfectly."))
187
+ ```
188
+
189
+ ## Ethical Considerations
190
+
191
+ Synthetic training data reduces annotator bias and broadens language coverage, but real-world validation is strongly advised before deploying in high-stakes settings. Emotion labels are culturally situated — predictions should be treated as probabilistic signals, not ground truth about a person's internal state.
192
+
193
+ ## Citation
194
+
195
+ ```bib
196
+ @misc{borisov2026multilingual,
197
+ title={Multilingual Multi-Label Emotion Classification at Scale with Synthetic Data},
198
+ author={Vadim Borisov},
199
+ year={2026},
200
+ eprint={2604.12633},
201
+ archivePrefix={arXiv},
202
+ primaryClass={cs.CL},
203
+ url={https://arxiv.org/abs/2604.12633},
204
+ }
205
+ ```
206
+
207
+ ## Contact
208
+
209
+ For inquiries, data, private APIs, or better models, contact info@tabularis.ai.
210
+
211
+ tabularis.ai
212
+
213
+
214
+ <table align="center">
215
+ <tr>
216
+ <td align="center">
217
+ <a href="https://www.linkedin.com/company/tabularis-ai/">
218
+ <img src="https://cdn.jsdelivr.net/gh/simple-icons/simple-icons/icons/linkedin.svg" alt="LinkedIn" width="30" height="30">
219
+ </a>
220
+ </td>
221
+ <td align="center">
222
+ <a href="https://x.com/tabularis_ai">
223
+ <img src="https://cdn.jsdelivr.net/gh/simple-icons/simple-icons/icons/x.svg" alt="X" width="30" height="30">
224
+ </a>
225
+ </td>
226
+ <td align="center">
227
+ <a href="https://github.com/tabularis-ai">
228
+ <img src="https://cdn.jsdelivr.net/gh/simple-icons/simple-icons/icons/github.svg" alt="GitHub" width="30" height="30">
229
+ </a>
230
+ </td>
231
+ <td align="center">
232
+ <a href="https://tabularis.ai">
233
+ <img src="https://cdn.jsdelivr.net/gh/simple-icons/simple-icons/icons/internetarchive.svg" alt="Website" width="30" height="30">
234
+ </a>
235
+ </td>
236
+ </tr>
237
+ </table>
config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": 2,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "anger",
16
+ "1": "contempt",
17
+ "2": "disgust",
18
+ "3": "fear",
19
+ "4": "frustration",
20
+ "5": "gratitude",
21
+ "6": "joy",
22
+ "7": "love",
23
+ "8": "neutral",
24
+ "9": "sadness",
25
+ "10": "surprise"
26
+ },
27
+ "initializer_range": 0.02,
28
+ "intermediate_size": 3072,
29
+ "is_decoder": false,
30
+ "label2id": {
31
+ "anger": 0,
32
+ "contempt": 1,
33
+ "disgust": 2,
34
+ "fear": 3,
35
+ "frustration": 4,
36
+ "gratitude": 5,
37
+ "joy": 6,
38
+ "love": 7,
39
+ "neutral": 8,
40
+ "sadness": 9,
41
+ "surprise": 10
42
+ },
43
+ "layer_norm_eps": 1e-05,
44
+ "max_position_embeddings": 514,
45
+ "model_type": "xlm-roberta",
46
+ "num_attention_heads": 12,
47
+ "num_hidden_layers": 12,
48
+ "output_past": true,
49
+ "pad_token_id": 1,
50
+ "position_embedding_type": "absolute",
51
+ "problem_type": "multi_label_classification",
52
+ "tie_word_embeddings": true,
53
+ "transformers_version": "5.5.3",
54
+ "type_vocab_size": 1,
55
+ "use_cache": true,
56
+ "vocab_size": 250002
57
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:300fd92943ec7541749bd56fcb3f4c037d2ba3313bf54c0c4046913156fdd3c3
3
+ size 1112232668
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acbd420e2269cdc1ef45332d3d5c418be4aef6b8cb5a0b7ccae0893485307153
3
+ size 17098086
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "is_local": false,
8
+ "mask_token": "<mask>",
9
+ "model_max_length": 512,
10
+ "pad_token": "<pad>",
11
+ "sep_token": "</s>",
12
+ "tokenizer_class": "XLMRobertaTokenizer",
13
+ "unk_token": "<unk>"
14
+ }