poltextlab commited on
Commit
f567574
·
verified ·
1 Parent(s): cc91f78

Automated hub push by babel_finetune_agent

Browse files
Files changed (5) hide show
  1. README.md +84 -0
  2. config.json +36 -0
  3. finetune_config.json +9 -0
  4. model.safetensors +3 -0
  5. training_args.bin +3 -0
README.md ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ model-index:
3
+ - name: poltextlab/xlm-roberta-large-i5-binary-codebook-v16
4
+ results:
5
+ - task:
6
+ type: text-classification
7
+ metrics:
8
+ - name: Accuracy
9
+ type: accuracy
10
+ value: N/A
11
+ - name: F1-Score
12
+ type: f1
13
+ value: 0.78
14
+ tags:
15
+ - text-classification
16
+ - pytorch
17
+ metrics:
18
+ - precision
19
+ - recall
20
+ - f1-score
21
+ language:
22
+ - en
23
+ base_model:
24
+ - xlm-roberta-large
25
+ pipeline_tag: text-classification
26
+ library_name: transformers
27
+ license: cc-by-4.0
28
+ extra_gated_prompt: Our models are intended for academic projects and academic research
29
+ only. If you are not affiliated with an academic institution, please reach out to
30
+ us at huggingface [at] poltextlab [dot] com for further inquiry. If we cannot clearly
31
+ determine your academic affiliation and use case based on your form data, your request
32
+ may be rejected. Please allow us a few business days to manually review subscriptions.
33
+ extra_gated_fields:
34
+ Name: text
35
+ Country: country
36
+ Institution: text
37
+ Institution Email: text
38
+ Please specify your academic use case: text
39
+ ---
40
+
41
+ # xlm-roberta-large-i5-binary-codebook-v16
42
+
43
+
44
+ # How to use the model
45
+
46
+ ```python
47
+ from transformers import AutoTokenizer, pipeline
48
+
49
+ tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
50
+ pipe = pipeline(
51
+ model="poltextlab/xlm-roberta-large-i5-binary-codebook-v16",
52
+ task="text-classification",
53
+ tokenizer=tokenizer,
54
+ use_fast=False,
55
+ token="<your_hf_read_only_token>"
56
+ )
57
+
58
+ text = "<text_to_classify>"
59
+ pipe(text)
60
+ ```
61
+
62
+
63
+ # Classification Report
64
+
65
+ ## Overall Performance:
66
+
67
+ * **Accuracy:** N/A
68
+ * **Macro Avg:** Precision: 0.78, Recall: 0.78, F1-score: 0.78
69
+ * **Weighted Avg:** Precision: 0.78, Recall: 0.78, F1-score: 0.78
70
+
71
+ ## Per-Class Metrics:
72
+
73
+ | Label | Precision | Recall | F1-score | Support |
74
+ |:------------------|------------:|---------:|-----------:|----------:|
75
+ | (0) Not illiberal | 0.8 | 0.8 | 0.8 | 30 |
76
+ | (1) Illiberal | 0.76 | 0.76 | 0.76 | 25 |
77
+
78
+ # Inference platform
79
+ This model is used by the [CAP Babel Machine](https://babel.poltextlab.com), an open-source and free natural language processing tool, designed to simplify and speed up projects for comparative research.
80
+
81
+ # Cooperation
82
+ Model performance can be significantly improved by extending our training sets. We appreciate every submission of CAP-coded corpora (of any domain and language) at poltextlab{at}poltextlab{dot}com or by using the [CAP Babel Machine](https://babel.poltextlab.com).
83
+ ## Debugging and issues
84
+ This architecture uses the `sentencepiece` tokenizer. In order to run the model before `transformers==4.27` you need to install it manually.
config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "XLMRobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 1024,
12
+ "id2label": {
13
+ "0": "Illiberal",
14
+ "1": "Not illiberal"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 4096,
18
+ "label2id": {
19
+ "Illiberal": 0,
20
+ "Not illiberal": 1
21
+ },
22
+ "layer_norm_eps": 1e-05,
23
+ "max_position_embeddings": 514,
24
+ "model_type": "xlm-roberta",
25
+ "num_attention_heads": 16,
26
+ "num_hidden_layers": 24,
27
+ "output_past": true,
28
+ "pad_token_id": 1,
29
+ "position_embedding_type": "absolute",
30
+ "problem_type": "multi_label_classification",
31
+ "torch_dtype": "float32",
32
+ "transformers_version": "4.51.3",
33
+ "type_vocab_size": 1,
34
+ "use_cache": true,
35
+ "vocab_size": 250002
36
+ }
finetune_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_path": "xlm-roberta-large",
3
+ "model_type": "encoder",
4
+ "learning_rate": 2e-05,
5
+ "epochs": 15,
6
+ "batch_size": 16,
7
+ "max_seq_length": 256,
8
+ "domain": "migration"
9
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83d4a748814b7f95793db66e149fb7036ccea8867a92f936eb369d6ea2987d6a
3
+ size 2239618672
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79aa5bd0893849be024a7e424f9b05eafac4570b9f655224334de39e1912154e
3
+ size 5713