bigint committed
Commit cef83d4 · 1 Parent(s): f99a439

fix: conflicts

.gitattributes CHANGED
@@ -2,34 +2,26 @@
  *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
  *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
  *.ftz filter=lfs diff=lfs merge=lfs -text
  *.gz filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
  *.joblib filter=lfs diff=lfs merge=lfs -text
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
  *.onnx filter=lfs diff=lfs merge=lfs -text
  *.ot filter=lfs diff=lfs merge=lfs -text
  *.parquet filter=lfs diff=lfs merge=lfs -text
  *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
  *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
  *.wasm filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
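
The `filter=lfs` attributes above decide which file patterns Git LFS intercepts and stores as pointers. As a rough sketch of how such patterns map onto file names, using Python's `fnmatch` as an approximation of gitattributes glob semantics (real gitattributes matching has extra rules, e.g. for `**` and path anchoring):

```python
# Sketch: which .gitattributes LFS patterns would match a given file name.
# fnmatch only approximates gitattributes globbing; it is enough to see
# why pytorch_model.bin becomes an LFS pointer while config.json does not.
from fnmatch import fnmatch

ATTRS = """\
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
"""

def lfs_patterns(text):
    """Yield the glob pattern from every line that carries filter=lfs."""
    for line in text.splitlines():
        parts = line.split()
        if parts and "filter=lfs" in parts[1:]:
            yield parts[0]

for name in ("pytorch_model.bin", "tf_model.h5", "config.json"):
    tracked = any(fnmatch(name, pat) for pat in lfs_patterns(ATTRS))
    print(f"{name}: {'LFS' if tracked else 'plain git'}")
```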
README.md ADDED
@@ -0,0 +1,100 @@
+ ---
+ language: en
+ widget:
+ - text: It is great to see athletes promoting awareness for climate change.
+ datasets:
+ - cardiffnlp/tweet_topic_multi
+ license: mit
+ metrics:
+ - f1
+ - accuracy
+ pipeline_tag: text-classification
+ ---
+
+ # tweet-topic-21-multi
+
+ This model is based on a [TimeLMs](https://github.com/cardiffnlp/timelms) language model trained on ~124M tweets from January 2018 to December 2021 (see [here](https://huggingface.co/cardiffnlp/twitter-roberta-base-2021-124m)), and fine-tuned for multi-label topic classification on a corpus of 11,267 [tweets](https://huggingface.co/datasets/cardiffnlp/tweet_topic_multi). This model is suitable for English text.
+
+ - Reference Paper: [TweetTopic](https://arxiv.org/abs/2209.09824) (COLING 2022).
+
+ <b>Labels</b>:
+
+
+ | <span style="font-weight:normal">0: arts_&_culture</span> | <span style="font-weight:normal">5: fashion_&_style</span> | <span style="font-weight:normal">10: learning_&_educational</span> | <span style="font-weight:normal">15: science_&_technology</span> |
+ |-----------------------------|---------------------|----------------------------|--------------------------|
+ | 1: business_&_entrepreneurs | 6: film_tv_&_video | 11: music | 16: sports |
+ | 2: celebrity_&_pop_culture | 7: fitness_&_health | 12: news_&_social_concern | 17: travel_&_adventure |
+ | 3: diaries_&_daily_life | 8: food_&_dining | 13: other_hobbies | 18: youth_&_student_life |
+ | 4: family | 9: gaming | 14: relationships | |
+
+
+ ## Full classification example
+
+ ```python
+ from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
+ from transformers import AutoTokenizer
+ import numpy as np
+ from scipy.special import expit
+
+
+ MODEL = "cardiffnlp/tweet-topic-21-multi"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
+
+ # PT
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL)
+ class_mapping = model.config.id2label
+
+ text = "It is great to see athletes promoting awareness for climate change."
+ tokens = tokenizer(text, return_tensors='pt')
+ output = model(**tokens)
+
+ scores = output[0][0].detach().numpy()
+ scores = expit(scores)
+ predictions = (scores >= 0.5) * 1
+
+
+ # TF
+ #tf_model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
+ #class_mapping = tf_model.config.id2label
+ #text = "It is great to see athletes promoting awareness for climate change."
+ #tokens = tokenizer(text, return_tensors='tf')
+ #output = tf_model(**tokens)
+ #scores = output[0][0]
+ #scores = expit(scores)
+ #predictions = (scores >= 0.5) * 1
+
+ # Map to classes
+ for i in range(len(predictions)):
+     if predictions[i]:
+         print(class_mapping[i])
+
+ ```
+ Output:
+
+ ```
+ news_&_social_concern
+ sports
+ ```
+
+ ### BibTeX entry and citation info
+
+ Please cite the [reference paper](https://aclanthology.org/2022.coling-1.299/) if you use this model.
+
+ ```bibtex
+ @inproceedings{antypas-etal-2022-twitter,
+     title = "{T}witter Topic Classification",
+     author = "Antypas, Dimosthenis and
+       Ushio, Asahi and
+       Camacho-Collados, Jose and
+       Silva, Vitor and
+       Neves, Leonardo and
+       Barbieri, Francesco",
+     booktitle = "Proceedings of the 29th International Conference on Computational Linguistics",
+     month = oct,
+     year = "2022",
+     address = "Gyeongju, Republic of Korea",
+     publisher = "International Committee on Computational Linguistics",
+     url = "https://aclanthology.org/2022.coling-1.299",
+     pages = "3386--3400"
+ }
+ ```
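
As an aside to the card's full example above, the same prediction can usually be obtained with far less code through the `pipeline` API. The sketch below is not part of the original card; it assumes a transformers version whose text-classification pipeline applies a sigmoid for multi-label heads and accepts `top_k=None` to return a score for every label:

```python
# Sketch (not from the original card): the same multi-label topic
# prediction via the high-level pipeline API. top_k=None requests scores
# for all 19 labels; we keep those above the card's 0.5 threshold.
from transformers import pipeline

pipe = pipeline("text-classification",
                model="cardiffnlp/tweet-topic-21-multi",
                top_k=None)

results = pipe("It is great to see athletes promoting awareness for climate change.")
if results and isinstance(results[0], list):  # some versions nest one level
    results = results[0]
for item in results:
    if item["score"] >= 0.5:
        print(item["label"], round(item["score"], 3))
```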
config.json ADDED
@@ -0,0 +1,71 @@
+ {
+   "_name_or_path": "antypasd/tweet-topic-21-multi",
+   "architectures": [
+     "RobertaForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "arts_&_culture",
+     "1": "business_&_entrepreneurs",
+     "2": "celebrity_&_pop_culture",
+     "3": "diaries_&_daily_life",
+     "4": "family",
+     "5": "fashion_&_style",
+     "6": "film_tv_&_video",
+     "7": "fitness_&_health",
+     "8": "food_&_dining",
+     "9": "gaming",
+     "10": "learning_&_educational",
+     "11": "music",
+     "12": "news_&_social_concern",
+     "13": "other_hobbies",
+     "14": "relationships",
+     "15": "science_&_technology",
+     "16": "sports",
+     "17": "travel_&_adventure",
+     "18": "youth_&_student_life"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "arts_&_culture": 0,
+     "business_&_entrepreneurs": 1,
+     "celebrity_&_pop_culture": 2,
+     "diaries_&_daily_life": 3,
+     "family": 4,
+     "fashion_&_style": 5,
+     "film_tv_&_video": 6,
+     "fitness_&_health": 7,
+     "food_&_dining": 8,
+     "gaming": 9,
+     "learning_&_educational": 10,
+     "music": 11,
+     "news_&_social_concern": 12,
+     "other_hobbies": 13,
+     "relationships": 14,
+     "science_&_technology": 15,
+     "sports": 16,
+     "travel_&_adventure": 17,
+     "youth_&_student_life": 18
+   },
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "problem_type": "multi_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.19.2",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
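
Two fields in this config do most of the work for the README example: `problem_type` makes transformers treat the 19 logits as independent sigmoid outputs (with BCE loss during fine-tuning), and `id2label` supplies the index-to-name mapping used in the card's print loop. A minimal sketch of inspecting them, assuming only that `AutoConfig` can fetch this config:

```python
# Sketch: inspect the fields that make this checkpoint multi-label.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("cardiffnlp/tweet-topic-21-multi")
print(config.problem_type)  # multi_label_classification -> sigmoid + BCE
print(config.num_labels)    # 19
print(config.id2label[16])  # "sports" (keys become ints once loaded)
```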
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b215c9f18a58753b4d276d2acf71f16d09467463c176971fd7a7ea37172377e6
+ size 498723565
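
The three `+` lines above are not the weights themselves but a Git LFS pointer: the `size` field says the real payload is 498,723,565 bytes (~499 MB), fetched by the LFS filter on checkout. A small sketch for spotting a pointer that was checked out without LFS installed (hypothetical helper, not part of this repo):

```python
# Sketch: tell a Git LFS pointer (a ~130-byte text stub) apart from the
# real binary it stands for. Pointer files always start with this line.
def is_lfs_pointer(path: str) -> bool:
    with open(path, "rb") as f:
        return f.read(46).startswith(b"version https://git-lfs.github.com/spec/v1")

# Usage (hypothetical): warn before torch.load chokes on a text stub.
# if is_lfs_pointer("pytorch_model.bin"):
#     print("Pointer only -- run `git lfs pull` to fetch the weights.")
```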
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:97e8a502c449ee300305442f555e20a006af45853d8a7d005fc0dd9c771b184d
+ size 498930560
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "special_tokens_map_file": "/home/antypasd/.cache/huggingface/transformers/601312a9cb96656475ff2ef71b3b002f803e0889279718ab471aed2c84b95b18.a11ebb04664c067c8fe5ef8f8068b0f721263414a26058692f7b2e4ba2a1b342", "name_or_path": "cardiffnlp/twitter-roberta-base-sentiment-latest", "tokenizer_class": "RobertaTokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff