Commit 41a4455
Parent(s): 59478dc

Upload 8 files

- README.md +69 -0
- config.json +43 -0
- gitattributes +35 -0
- special_tokens_map.json +7 -0
- tokenizer.json +0 -0
- tokenizer_config.json +13 -0
- training_args.bin +3 -0
- vocab.txt +0 -0
README.md
ADDED
@@ -0,0 +1,69 @@
---
pipeline_tag: text-classification
tags:
- code
license: apache-2.0
datasets:
- Alex123321/english_cefr_dataset
language:
- en
metrics:
- accuracy
library_name: transformers
---

# Model Card: BERT-based CEFR Classifier

## Overview

This repository contains a model trained to predict Common European Framework of Reference (CEFR) levels for a given text using a BERT-based architecture. The model was fine-tuned on the CEFR dataset, with the pre-trained `bert-base-cased` model as the base.

## Model Details

- Model architecture: BERT (base model: `bert-base-cased`)
- Task: CEFR level prediction (text classification)
- Training dataset: CEFR dataset (Alex123321/english_cefr_dataset)
- Fine-tuning: 5 epochs; per-epoch training and validation losses are reported below

## Performance

The model's performance during training is summarized below:

| Epoch | Training Loss | Validation Loss |
|-------|---------------|-----------------|
| 1     | 0.412300      | 0.396337        |
| 2     | 0.369600      | 0.388866        |
| 3     | 0.298200      | 0.419018        |
| 4     | 0.214500      | 0.481886        |
| 5     | 0.148300      | 0.557343        |

Additional metrics:

- Training loss: 0.2900624789151278
- Training runtime: 5168.3962 seconds
- Training samples per second: 10.642
- Total floating-point operations: 1.447162776576e+16

## Usage

1. Install the required libraries by running `pip install transformers`.
2. Load the trained model and use it for CEFR level prediction:

from transformers import pipeline

# Load the model
model_name = "AbdulSami/bert-base-cased-cefr"

classifier = pipeline("text-classification", model=model_name)

# Text for prediction
text = "This is a sample text for CEFR classification."

# Predict CEFR level
predictions = classifier(text)

# Print the predictions
print(predictions)
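As a complement to the pipeline snippet in the README, the sketch below loads the same checkpoint directly with `AutoTokenizer` and `AutoModelForSequenceClassification`. It is a minimal illustration and not part of the uploaded files; it assumes `torch` and a recent `transformers` release are installed, and the sigmoid readout follows the `problem_type: multi_label_classification` setting in the `config.json` added in this commit.

```python
# Minimal sketch (not from the original README): direct loading instead of pipeline().
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "AbdulSami/bert-base-cased-cefr"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

text = "This is a sample text for CEFR classification."
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

with torch.no_grad():
    logits = model(**inputs).logits

# config.json declares problem_type = multi_label_classification,
# so scores are read with a sigmoid rather than a softmax.
scores = torch.sigmoid(logits)[0]
predicted = model.config.id2label[int(scores.argmax())]
print(predicted, scores.tolist())
```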
config.json
ADDED
@@ -0,0 +1,43 @@
{
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "A1",
    "1": "A2",
    "2": "B1",
    "3": "B2",
    "4": "C1",
    "5": "C2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "A1": 0,
    "A2": 1,
    "B1": 2,
    "B2": 3,
    "C1": 4,
    "C2": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "multi_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}
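The `id2label` / `label2id` maps above are what turn the classifier's six output indices into CEFR levels. A small illustrative sketch, assuming only that `transformers` can fetch this config from the Hub, shows how those fields look once loaded:

```python
# Illustrative only: inspect the config fields shown above via AutoConfig.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("AbdulSami/bert-base-cased-cefr")
print(config.num_labels)       # 6 (A1 through C2)
print(config.id2label[3])      # "B2"
print(config.label2id["C1"])   # 4
print(config.problem_type)     # "multi_label_classification"
```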
gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
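These are the standard BERT special tokens. As a quick illustration (assuming the tokenizer files from this commit), they are inserted automatically around every encoded sequence:

```python
# Sketch: the [CLS]/[SEP] tokens listed above frame each encoded sentence.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("AbdulSami/bert-base-cased-cefr")
ids = tokenizer("A short sentence.")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))
# ['[CLS]', 'A', 'short', 'sentence', '.', '[SEP]']
```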
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,13 @@
{
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_lower_case": false,
  "mask_token": "[MASK]",
  "model_max_length": 512,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
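Two settings above matter most in practice: `do_lower_case: false` (the tokenizer is cased, matching `bert-base-cased`) and `model_max_length: 512`. A short sketch, again assuming the repo's tokenizer files, makes both visible:

```python
# Sketch: the tokenizer is cased and truncates to 512 tokens.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("AbdulSami/bert-base-cased-cefr")
print(tokenizer.model_max_length)      # 512
print(tokenizer.tokenize("Reading"))   # case is preserved, not lower-cased

long_text = "word " * 1000
ids = tokenizer(long_text, truncation=True, max_length=512)["input_ids"]
print(len(ids))                        # 512
```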
training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bb9f07591dac1a3d3595b59c1a5fe79dd94c04fdf86b50f7e064b5862a58b45b
size 3963
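`training_args.bin` is stored as a Git LFS pointer (3,963 bytes); it holds the pickled `TrainingArguments` object that the `transformers` `Trainer` saves alongside a checkpoint. A cautious sketch for inspecting it after downloading the actual file, assuming your `transformers` install is compatible with the 4.31.0 version recorded in `config.json`:

```python
# Sketch: inspect the hyperparameters stored in training_args.bin.
# The file is a torch-pickled TrainingArguments object, so loading it
# requires a transformers version compatible with 4.31.0.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(args.num_train_epochs, args.learning_rate, args.per_device_train_batch_size)
```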
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff