Cesar42 committed on
Commit
1c16b8c
1 Parent(s): 03a2d8b

Upload 8 files

Browse files
.gitattributes CHANGED
@@ -1,35 +1,16 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
  *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
4
  *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
9
+ *.arrow filter=lfs diff=lfs merge=lfs -text
10
+ *.ftz filter=lfs diff=lfs merge=lfs -text
11
  *.joblib filter=lfs diff=lfs merge=lfs -text
 
 
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
14
  *.pb filter=lfs diff=lfs merge=lfs -text
 
 
15
  *.pt filter=lfs diff=lfs merge=lfs -text
16
  *.pth filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,3 +1,91 @@
1
  ---
2
- license: apache-2.0
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language:
3
+ - es
4
+
5
+ tags:
6
+ - Emotion Analysis
7
+
8
  ---
9
+
10
+ **Note**: This model & model card are based on the [finetuned XLM-T for Sentiment Analysis](https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment)
11
+
12
+ # twitter-XLM-roBERTa-base for Emotion Analysis
13
+ This is an XLM-roBERTa-base model trained on ~198M tweets and finetuned for emotion analysis in Spanish. The model was submitted to the EmoEvalEs competition, part of the [IberLEF 2021 Conference](https://sites.google.com/view/iberlef2021/), where the proposed task was the classification of Spanish tweets into seven classes: *anger*, *disgust*, *fear*, *joy*, *sadness*, *surprise*, and *others*. We achieved first place in the competition with a macro-averaged F1 score of 71.70%.
14
+ - [Our code for EmoEvalEs submission](https://github.com/gsi-upm/emoevales-iberlef2021).
15
+ - [EmoEvalEs Dataset](https://github.com/pendrag/EmoEvalEs)
16
+ ## Example Pipeline with a [Tweet from @JaSantaolalla](https://twitter.com/JaSantaolalla/status/1398383243645177860)
17
+ ```python
18
+ from transformers import pipeline
19
+ model_path = "daveni/twitter-xlm-roberta-emotion-es"
20
+ emotion_analysis = pipeline("text-classification", framework="pt", model=model_path, tokenizer=model_path)
21
+ emotion_analysis("Einstein dijo: Solo hay dos cosas infinitas, el universo y los pinches anuncios de bitcoin en Twitter. Paren ya carajo aaaaaaghhgggghhh me quiero murir")
22
+ ```
23
+ ```
24
+ [{'label': 'anger', 'score': 0.48307016491889954}]
25
+ ```
26
+ ## Full classification example
27
+ ```python
28
+ from transformers import AutoModelForSequenceClassification
29
+ from transformers import AutoTokenizer, AutoConfig
30
+ import numpy as np
31
+ from scipy.special import softmax
32
+ # Preprocess text (username and link placeholders)
33
+ def preprocess(text):
34
+ new_text = []
35
+ for t in text.split(" "):
36
+ t = '@user' if t.startswith('@') and len(t) > 1 else t
37
+ t = 'http' if t.startswith('http') else t
38
+ new_text.append(t)
39
+ return " ".join(new_text)
40
+ model_path = "daveni/twitter-xlm-roberta-emotion-es"
41
+ tokenizer = AutoTokenizer.from_pretrained(model_path )
42
+ config = AutoConfig.from_pretrained(model_path )
43
+ # PT
44
+ model = AutoModelForSequenceClassification.from_pretrained(model_path )
45
+ text = "Se ha quedao bonito día para publicar vídeo, ¿no? Hoy del tema más diferente que hemos tocado en el canal."
46
+ text = preprocess(text)
47
+ print(text)
48
+ encoded_input = tokenizer(text, return_tensors='pt')
49
+ output = model(**encoded_input)
50
+ scores = output[0][0].detach().numpy()
51
+ scores = softmax(scores)
52
+ # Print labels and scores
53
+ ranking = np.argsort(scores)
54
+ ranking = ranking[::-1]
55
+ for i in range(scores.shape[0]):
56
+ l = config.id2label[ranking[i]]
57
+ s = scores[ranking[i]]
58
+ print(f"{i+1}) {l} {np.round(float(s), 4)}")
59
+ ```
60
+ Output:
61
+
62
+ ```
63
+ Se ha quedao bonito día para publicar vídeo, ¿no? Hoy del tema más diferente que hemos tocado en el canal.
64
+ 1) joy 0.7887
65
+ 2) others 0.1679
66
+ 3) surprise 0.0152
67
+ 4) sadness 0.0145
68
+ 5) anger 0.0077
69
+ 6) disgust 0.0033
70
+ 7) fear 0.0027
71
+ ```
72
+
73
+ #### Limitations and bias
74
+
75
+ - The dataset we used for finetuning was imbalanced: almost half of the records belonged to the *others* class, so the model's predictions may be biased towards this class.
76
+
77
+
78
+ ## Training data
79
+
80
+ Pretrained weights were left identical to the original model released by [cardiffnlp](https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base). We used the [EmoEvalEs Dataset](https://github.com/pendrag/EmoEvalEs) for finetuning.
81
+
82
+ ### BibTeX entry and citation info
83
+
84
+ ```bibtex
85
+ @inproceedings{vera2021gsi,
86
+ title={GSI-UPM at IberLEF2021: Emotion Analysis of Spanish Tweets by Fine-tuning the XLM-RoBERTa Language Model},
87
+ author={Vera, D and Araque, O and Iglesias, CA},
88
+ booktitle={Proceedings of the Iberian Languages Evaluation Forum (IberLEF 2021). CEUR Workshop Proceedings, CEUR-WS, M{\'a}laga, Spain},
89
+ year={2021}
90
+ }
91
+ ```
config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./resultsEmotion-preproc/best_model",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "sadness",
15
+ "1": "joy",
16
+ "2": "anger",
17
+ "3": "surprise",
18
+ "4": "disgust",
19
+ "5": "fear",
20
+ "6": "others"
21
+ },
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 3072,
24
+ "label2id": {
25
+ "anger": 2,
26
+ "disgust": 4,
27
+ "fear": 5,
28
+ "joy": 1,
29
+ "others": 6,
30
+ "sadness": 0,
31
+ "surprise": 3
32
+ },
33
+ "layer_norm_eps": 1e-05,
34
+ "max_position_embeddings": 514,
35
+ "model_type": "xlm-roberta",
36
+ "num_attention_heads": 12,
37
+ "num_hidden_layers": 12,
38
+ "output_past": true,
39
+ "pad_token_id": 1,
40
+ "position_embedding_type": "absolute",
41
+ "transformers_version": "4.6.1",
42
+ "type_vocab_size": 1,
43
+ "use_cache": true,
44
+ "vocab_size": 250002
45
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90c31580c660db1b7d5b10bfc3595dc7a7d9cdac699e68e25e5e047c904fa35a
3
+ size 1112283849
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "sep_token": "</s>", "cls_token": "<s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "special_tokens_map_file": null, "name_or_path": "cardiffnlp/twitter-xlm-roberta-base"}