aloychow committed
Commit
afde400
1 Parent(s): 350a544

Upload 7 files

Browse files
Files changed (7)
  1. README.md +130 -1
  2. config.json +37 -0
  3. merges.txt +0 -0
  4. pytorch_model.bin +3 -0
  5. special_tokens_map.json +1 -0
  6. tf_model.h5 +3 -0
  7. vocab.json +0 -0
README.md CHANGED
@@ -1,3 +1,132 @@
  ---
- license: apache-2.0
+ language: en
+ widget:
+ - text: Covid cases are increasing fast!
+ datasets:
+ - tweet_eval
  ---
+
+
+ # Twitter-roBERTa-base for Sentiment Analysis - UPDATED (2022)
+
+ This is a RoBERTa-base model trained on ~124M tweets from January 2018 to December 2021, and fine-tuned for sentiment analysis with the TweetEval benchmark.
+ The original Twitter-based RoBERTa model can be found [here](https://huggingface.co/cardiffnlp/twitter-roberta-base-2021-124m) and the original reference paper is [TweetEval](https://github.com/cardiffnlp/tweeteval). This model is suitable for English.
+
+ - Reference Paper: [TimeLMs paper](https://arxiv.org/abs/2202.03829).
+ - Git Repo: [TimeLMs official repository](https://github.com/cardiffnlp/timelms).
+
+ <b>Labels</b>:
+ 0 -> Negative;
+ 1 -> Neutral;
+ 2 -> Positive
+
+ This sentiment analysis model has been integrated into [TweetNLP](https://github.com/cardiffnlp/tweetnlp). You can access the demo [here](https://tweetnlp.org).
+
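+ As a quick illustration of that integration, here is a minimal sketch using the TweetNLP package (the `tweetnlp` import and `load_model` call follow the TweetNLP repository's documented usage; treat the exact names as assumptions, not part of this model card):
+
+ ```python
+ # Hedged sketch: assumes `pip install tweetnlp` and the API shown in the TweetNLP repo.
+ import tweetnlp
+
+ model = tweetnlp.load_model("sentiment")  # loads a Twitter sentiment model like this one
+ print(model.sentiment("Covid cases are increasing fast!"))
+ ```
+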
+ ## Example Pipeline
+ ```python
+ from transformers import pipeline
+
+ # Model identifier on the Hugging Face Hub
+ model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
+ sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)
+ sentiment_task("Covid cases are increasing fast!")
+ ```
+ ```
+ [{'label': 'Negative', 'score': 0.7236}]
+ ```
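+
+ The pipeline above returns only the top label. To see scores for all three labels, recent transformers versions accept `top_k=None` on text-classification pipelines (older releases used `return_all_scores=True`); a hedged variant:
+
+ ```python
+ from transformers import pipeline
+
+ model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
+ # top_k=None asks the pipeline for every label's score, not just the best one.
+ sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path, top_k=None)
+ print(sentiment_task("Covid cases are increasing fast!"))
+ ```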
+
+ ## Full classification example
+
+ ```python
+ from transformers import AutoModelForSequenceClassification
+ from transformers import TFAutoModelForSequenceClassification
+ from transformers import AutoTokenizer, AutoConfig
+ import numpy as np
+ from scipy.special import softmax
+
+ # Preprocess text (username and link placeholders)
+ def preprocess(text):
+     new_text = []
+     for t in text.split(" "):
+         t = '@user' if t.startswith('@') and len(t) > 1 else t
+         t = 'http' if t.startswith('http') else t
+         new_text.append(t)
+     return " ".join(new_text)
+
+ MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
+ config = AutoConfig.from_pretrained(MODEL)
+
+ # PT
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL)
+ #model.save_pretrained(MODEL)
+ text = "Covid cases are increasing fast!"
+ text = preprocess(text)
+ encoded_input = tokenizer(text, return_tensors='pt')
+ output = model(**encoded_input)
+ scores = output[0][0].detach().numpy()
+ scores = softmax(scores)
+
+ # # TF
+ # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
+ # model.save_pretrained(MODEL)
+ # text = "Covid cases are increasing fast!"
+ # encoded_input = tokenizer(text, return_tensors='tf')
+ # output = model(encoded_input)
+ # scores = output[0][0].numpy()
+ # scores = softmax(scores)
+
+ # Print labels and scores, ranked from most to least likely
+ ranking = np.argsort(scores)
+ ranking = ranking[::-1]
+ for i in range(scores.shape[0]):
+     l = config.id2label[ranking[i]]
+     s = scores[ranking[i]]
+     print(f"{i+1}) {l} {np.round(float(s), 4)}")
+ ```
+
+ Output:
+
+ ```
+ 1) Negative 0.7236
+ 2) Neutral 0.2287
+ 3) Positive 0.0477
+ ```
+
+
+ ### References
+ ```
+ @inproceedings{camacho-collados-etal-2022-tweetnlp,
+     title = "{T}weet{NLP}: Cutting-Edge Natural Language Processing for Social Media",
+     author = "Camacho-collados, Jose and
+       Rezaee, Kiamehr and
+       Riahi, Talayeh and
+       Ushio, Asahi and
+       Loureiro, Daniel and
+       Antypas, Dimosthenis and
+       Boisson, Joanne and
+       Espinosa Anke, Luis and
+       Liu, Fangyu and
+       Mart{\'\i}nez C{\'a}mara, Eugenio and others",
+     booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+     month = dec,
+     year = "2022",
+     address = "Abu Dhabi, UAE",
+     publisher = "Association for Computational Linguistics",
+     url = "https://aclanthology.org/2022.emnlp-demos.5",
+     pages = "38--49"
+ }
+ ```
+
+ ```
+ @inproceedings{loureiro-etal-2022-timelms,
+     title = "{T}ime{LM}s: Diachronic Language Models from {T}witter",
+     author = "Loureiro, Daniel and
+       Barbieri, Francesco and
+       Neves, Leonardo and
+       Espinosa Anke, Luis and
+       Camacho-collados, Jose",
+     booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: System Demonstrations",
+     month = may,
+     year = "2022",
+     address = "Dublin, Ireland",
+     publisher = "Association for Computational Linguistics",
+     url = "https://aclanthology.org/2022.acl-demo.25",
+     doi = "10.18653/v1/2022.acl-demo.25",
+     pages = "251--260"
+ }
+ ```
config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "_name_or_path": "/home/jupyter/misc/tweeteval/TweetEval_models/sentiment/sentiment_latest_2021/",
+   "architectures": [
+     "RobertaForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "negative",
+     "1": "neutral",
+     "2": "positive"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "negative": 0,
+     "neutral": 1,
+     "positive": 2
+   },
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.13.0.dev0",
+   "type_vocab_size": 1,
+   "vocab_size": 50265
+ }
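
The `id2label` and `label2id` maps in this config are what `AutoConfig` and the pipeline use to turn class indices into the label strings shown in the README examples. A minimal sketch using only the standard transformers API:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
# Both maps come straight from the config.json above (keys are converted to ints on load).
print(config.id2label[2])           # -> "positive"
print(config.label2id["negative"])  # -> 0
```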
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d24a3e32a88ed1c4e5b789fc6644e2e767500554e954b27dccf52a8e762cbae
+ size 501045531
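
The three lines above are a Git LFS pointer: the weights are stored out of band and identified by their SHA-256 digest (`oid`) and byte size. A hedged sketch of verifying a downloaded `pytorch_model.bin` against this pointer, using only the Python standard library (the local file path is an assumption):

```python
import hashlib
import os

path = "pytorch_model.bin"  # hypothetical local copy of the downloaded weights
expected_oid = "4d24a3e32a88ed1c4e5b789fc6644e2e767500554e954b27dccf52a8e762cbae"
expected_size = 501045531

h = hashlib.sha256()
with open(path, "rb") as f:
    # Hash in 1 MiB chunks; at ~500 MB the file is too large to read in one piece comfortably.
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

print(os.path.getsize(path) == expected_size and h.hexdigest() == expected_oid)  # True if intact
```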
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
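
These definitions should match what the tokenizer reports at runtime, since `AutoTokenizer` reads this file when loading. A quick check:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
# Each attribute below is populated from special_tokens_map.json.
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token, tokenizer.mask_token)
# -> <s> </s> <pad> <mask>
```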
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:682358ffb3869b08a144d5e59325534335729720fe64d5f2b3a543f8e5d14a9e
+ size 498845224
vocab.json ADDED
The diff for this file is too large to render. See raw diff