[update]add code
- .gitattributes +1 -0
- .gitignore +8 -0
- README.md +4 -5
- examples/text_classification/telemarketing_intent_classification/1.prepare_data.py +85 -0
- examples/text_classification/telemarketing_intent_classification/2.make_hierarchical_labels.py +74 -0
- examples/text_classification/telemarketing_intent_classification/3.make_vocabulary.py +78 -0
- examples/text_classification/telemarketing_intent_classification/4.train_model.py +172 -0
- examples/text_classification/telemarketing_intent_classification/5.predict_model.py +117 -0
- examples/text_classification/telemarketing_intent_classification/6.make_json_config.py +121 -0
- examples/text_classification/telemarketing_intent_classification/7.predict_by_archive.py +74 -0
- examples/text_classification/telemarketing_intent_classification/run.sh +254 -0
- main.py +141 -0
- predict.py +142 -0
- pretrained_models/bert-base-japanese/.gitattributes +9 -0
- pretrained_models/bert-base-japanese/README.md +43 -0
- pretrained_models/bert-base-japanese/config.json +20 -0
- pretrained_models/bert-base-japanese/tokenizer_config.json +5 -0
- pretrained_models/bert-base-japanese/vocab.txt +0 -0
- pretrained_models/bert-base-uncased/.gitattributes +11 -0
- pretrained_models/bert-base-uncased/LICENSE +201 -0
- pretrained_models/bert-base-uncased/README.md +251 -0
- pretrained_models/bert-base-uncased/config.json +23 -0
- pretrained_models/bert-base-uncased/tokenizer.json +0 -0
- pretrained_models/bert-base-uncased/tokenizer_config.json +3 -0
- pretrained_models/bert-base-uncased/vocab.txt +0 -0
- pretrained_models/bert-base-vietnamese-uncased/.gitattributes +17 -0
- pretrained_models/bert-base-vietnamese-uncased/README.md +22 -0
- pretrained_models/bert-base-vietnamese-uncased/config.json +27 -0
- pretrained_models/bert-base-vietnamese-uncased/special_tokens_map.json +1 -0
- pretrained_models/bert-base-vietnamese-uncased/tokenizer_config.json +1 -0
- pretrained_models/bert-base-vietnamese-uncased/vocab.txt +0 -0
- pretrained_models/chinese-bert-wwm-ext/.gitattributes +9 -0
- pretrained_models/chinese-bert-wwm-ext/README.md +52 -0
- pretrained_models/chinese-bert-wwm-ext/added_tokens.json +1 -0
- pretrained_models/chinese-bert-wwm-ext/config.json +26 -0
- pretrained_models/chinese-bert-wwm-ext/special_tokens_map.json +1 -0
- pretrained_models/chinese-bert-wwm-ext/tokenizer.json +0 -0
- pretrained_models/chinese-bert-wwm-ext/tokenizer_config.json +1 -0
- pretrained_models/chinese-bert-wwm-ext/vocab.txt +0 -0
- project_settings.py +12 -0
- requirements.txt +14 -0
- toolbox/__init__.py +6 -0
- toolbox/allennlp_models/text_classifier/dataset_readers/__init__.py +6 -0
- toolbox/allennlp_models/text_classifier/dataset_readers/hierarchical_classification_json.py +99 -0
- toolbox/allennlp_models/text_classifier/models/__init__.py +6 -0
- toolbox/allennlp_models/text_classifier/models/hierarchical_text_classifier.py +291 -0
- toolbox/torch/__init__.py +6 -0
- toolbox/torch/modules/__init__.py +6 -0
- toolbox/torch/modules/loss.py +738 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.th filter=lfs diff=lfs merge=lfs -text

.gitignore
ADDED
@@ -0,0 +1,8 @@
.git/
.idea/

**/flagged/
**/__pycache__/

trained_models/

README.md
CHANGED
@@ -1,13 +1,12 @@
 ---
 title: Telemarketing Intent Classification
-emoji:
+emoji: 😻
 colorFrom: indigo
-colorTo:
+colorTo: yellow
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.20.1
-app_file:
+app_file: main.py
 pinned: false
-license: apache-2.0
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

examples/text_classification/telemarketing_intent_classification/1.prepare_data.py
ADDED
@@ -0,0 +1,85 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
from pathlib import Path
import random
import sys

pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, '../../../'))

import pandas as pd
from tqdm import tqdm

from project_settings import project_path


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--without_irrelevant_domain', action='store_true')
    parser.add_argument('--dataset_filename', default='dataset.xlsx', type=str)
    parser.add_argument('--do_lowercase', action='store_true')

    parser.add_argument('--train_subset', default='train.json', type=str)
    parser.add_argument('--valid_subset', default='valid.json', type=str)

    args = parser.parse_args()
    return args


def main():
    args = get_args()

    n_hierarchical = 2

    df = pd.read_excel(args.dataset_filename)
    df = df[df['selected'] == 1]

    dataset = list()
    for i, row in tqdm(df.iterrows(), total=len(df)):
        text = row['text']
        label0 = row['label0']
        if args.without_irrelevant_domain and label0 == '无关领域':
            continue

        text = str(text)
        if args.do_lowercase:
            text = text.lower()

        labels = {'label{}'.format(idx): str(row['label{}'.format(idx)]) for idx in range(n_hierarchical)}

        random1 = random.random()
        random2 = random.random()

        dataset.append({
            'text': text,
            **labels,

            'random1': random1,
            'random2': random2,
            'flag': 'TRAIN' if random2 < 0.8 else 'TEST',
        })

    dataset = list(sorted(dataset, key=lambda x: x['random1'], reverse=True))

    f_train = open(args.train_subset, 'w', encoding='utf-8')
    f_test = open(args.valid_subset, 'w', encoding='utf-8')

    for row in tqdm(dataset):
        flag = row['flag']
        row = json.dumps(row, ensure_ascii=False)
        if flag == 'TRAIN':
            f_train.write('{}\n'.format(row))
        else:
            f_test.write('{}\n'.format(row))

    f_train.close()
    f_test.close()
    return


if __name__ == '__main__':
    main()

examples/text_classification/telemarketing_intent_classification/2.make_hierarchical_labels.py
ADDED
@@ -0,0 +1,74 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
from collections import OrderedDict
import os
from pathlib import Path
import pickle
import sys

pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, '../../'))

import pandas as pd


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_filename', default='dataset.xlsx', type=str)
    parser.add_argument('--hierarchical_labels_pkl', default='hierarchical_labels.pkl', type=str)

    args = parser.parse_args()
    return args


def main():
    args = get_args()

    n_hierarchical = 2

    df = pd.read_excel(args.dataset_filename)
    df = df[df['selected'] == 1]

    # build the hierarchical_labels mapping
    temp_hierarchical_labels = OrderedDict()

    for i, row in df.iterrows():
        text = row['text']
        label0 = row['label0']
        label1 = row['label1']

        if temp_hierarchical_labels.get(label0) is None:
            temp_hierarchical_labels[label0] = list()

        if label1 not in temp_hierarchical_labels[label0]:
            temp_hierarchical_labels[label0].append(label1)

    if n_hierarchical > 2:
        hierarchical_labels = OrderedDict()
        for idx in range(n_hierarchical - 2):
            for k, v in temp_hierarchical_labels.items():
                parent, label = k.rsplit('_', maxsplit=1)

                if hierarchical_labels.get(parent) is None:
                    hierarchical_labels[parent] = OrderedDict({
                        label: v
                    })
                else:
                    if hierarchical_labels[parent].get(label) is None:
                        hierarchical_labels[parent][label] = v
    else:
        hierarchical_labels = temp_hierarchical_labels

    with open(args.hierarchical_labels_pkl, 'wb') as f:
        pickle.dump(hierarchical_labels, f)

    with open(args.hierarchical_labels_pkl, 'rb') as f:
        hierarchical_labels = pickle.load(f)

    # print(hierarchical_labels)
    return


if __name__ == '__main__':
    main()

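For orientation, here is a minimal sketch (not part of the commit) of the structure this script pickles for the two-level case; the label names below are hypothetical placeholders, since the real dataset is private.

# Hypothetical sketch of the object written to hierarchical_labels.pkl when n_hierarchical == 2:
# an OrderedDict mapping each level-0 label to the level-1 labels observed under it.
# "domain_a", "intent_1", ... are placeholders, not real labels from the dataset.
from collections import OrderedDict

hierarchical_labels = OrderedDict([
    ("domain_a", ["intent_1", "intent_2"]),
    ("domain_b", ["intent_3"]),
])
# 3.make_vocabulary.py later flattens these into "domain_a_intent_1"-style entries
# in the "labels" vocabulary namespace.
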
examples/text_classification/telemarketing_intent_classification/3.make_vocabulary.py
ADDED
@@ -0,0 +1,78 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
from collections import OrderedDict
import json
import os
from pathlib import Path
import pickle
import sys

pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, '../../../'))

import pandas as pd

from allennlp.data.vocabulary import Vocabulary

from project_settings import project_path


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained_model_path",
        default=(project_path / "pretrained_models/chinese-bert-wwm-ext").as_posix(),
        type=str
    )
    parser.add_argument('--hierarchical_labels_pkl', default='hierarchical_labels.pkl', type=str)
    parser.add_argument('--vocabulary', default='vocabulary', type=str)

    args = parser.parse_args()
    return args


def main():
    args = get_args()

    with open(args.hierarchical_labels_pkl, 'rb') as f:
        hierarchical_labels = pickle.load(f)
    # print(hierarchical_labels)

    # traverse the label hierarchy, level by level
    token_to_index = OrderedDict()
    tasks = [hierarchical_labels]
    while len(tasks) != 0:
        task = tasks.pop(0)
        for parent, downstream in task.items():
            if isinstance(downstream, list):
                for label in downstream:
                    if pd.isna(label):
                        continue
                    label = '{}_{}'.format(parent, label)
                    token_to_index[label] = len(token_to_index)
            elif isinstance(downstream, OrderedDict):
                new_task = OrderedDict()
                for k, v in downstream.items():
                    new_task['{}_{}'.format(parent, k)] = v
                tasks.append(new_task)
            else:
                raise NotImplementedError

    vocabulary = Vocabulary(non_padded_namespaces=['tokens', 'labels'])
    for label, index in token_to_index.items():
        vocabulary.add_token_to_namespace(label, namespace='labels')

    vocabulary.set_from_file(
        filename=os.path.join(args.pretrained_model_path, 'vocab.txt'),
        is_padded=False,
        oov_token='[UNK]',
        namespace='tokens',
    )
    vocabulary.save_to_files(args.vocabulary)

    # Reminder: check that the label order in the Vocabulary matches hierarchical_labels.
    print('注意检查 Vocabulary 中标签的顺序与 hierarchical_labels 是否一致. ')
    return


if __name__ == '__main__':
    main()

examples/text_classification/telemarketing_intent_classification/4.train_model.py
ADDED
@@ -0,0 +1,172 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Hierarchical (multi-level) text classification with stacked softmax layers.

At initialization, the softmax probabilities on each level are roughly uniform,
so the first level already assigns about 50% probability to `领域无关` (irrelevant domain),
while the classes under `领域相关` (relevant domain) share the remaining 50%.
As a result, the model predicts `领域无关` for every input at the start of training,
and it cannot optimize its way out of that state.

Workaround:
1. Remove the `领域无关` samples from the dataset and train the model.
2. After the model converges, train again on the dataset that includes `领域无关`,
   loading the previously trained weights and restarting training.

"""
import argparse
import json
import os
import sys

pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, '../../../'))

from allennlp.data.data_loaders.multiprocess_data_loader import MultiProcessDataLoader
from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer
from allennlp.data.token_indexers.pretrained_transformer_indexer import PretrainedTransformerIndexer
from allennlp.data.tokenizers.pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.text_field_embedders.basic_text_field_embedder import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.embedding import Embedding
from allennlp.modules.seq2vec_encoders.cnn_encoder import CnnEncoder
from allennlp_models.rc.modules.seq2seq_encoders.stacked_self_attention import StackedSelfAttentionEncoder
from allennlp.training.gradient_descent_trainer import GradientDescentTrainer
from allennlp.training.checkpointer import Checkpointer
from pytorch_pretrained_bert.optimization import BertAdam
import torch

from project_settings import project_path
from toolbox.allennlp_models.text_classifier.models.hierarchical_text_classifier import HierarchicalClassifier
from toolbox.allennlp_models.text_classifier.dataset_readers.hierarchical_classification_json import HierarchicalClassificationJsonReader


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained_model_path",
        default=(project_path / "pretrained_models/chinese-bert-wwm-ext").as_posix(),
        type=str
    )
    parser.add_argument('--hierarchical_labels_pkl', default='data_dir/hierarchical_labels.pkl', type=str)
    parser.add_argument('--vocabulary_dir', default='data_dir/vocabulary', type=str)
    parser.add_argument('--train_subset', default='data_dir/train.json', type=str)
    parser.add_argument('--valid_subset', default='data_dir/valid.json', type=str)
    parser.add_argument("--serialization_dir", default="data_dir/serialization_dir", type=str)
    # parser.add_argument('--checkpoint_path', default="data_dir/serialization_dir/best.th", type=str)
    parser.add_argument('--checkpoint_path', default=None, type=str)

    args = parser.parse_args()
    return args


def main():
    args = get_args()

    dataset_reader = HierarchicalClassificationJsonReader(
        token_indexers={
            'tokens': SingleIdTokenIndexer(
                namespace='tokens',
                lowercase_tokens=True,
                token_min_padding_length=5,
            )
        },
        tokenizer=PretrainedTransformerTokenizer(
            model_name=os.path.join(project_path, args.pretrained_model_path),
        ),
    )

    vocabulary = Vocabulary.from_files(args.vocabulary_dir)

    data_loader = MultiProcessDataLoader(
        reader=dataset_reader,
        data_path=args.train_subset,
        batch_size=64,
        shuffle=True,
    )
    data_loader.index_with(vocabulary)

    validation_data_loader = MultiProcessDataLoader(
        reader=dataset_reader,
        data_path=args.valid_subset,
        batch_size=64,
        shuffle=True,
    )
    validation_data_loader.index_with(vocabulary)

    model = HierarchicalClassifier(
        vocab=vocabulary,
        hierarchical_labels_pkl=args.hierarchical_labels_pkl,
        text_field_embedder=BasicTextFieldEmbedder(
            token_embedders={
                'tokens': Embedding(
                    num_embeddings=vocabulary.get_vocab_size('tokens'),
                    embedding_dim=128,
                )
            }
        ),
        seq2seq_encoder=StackedSelfAttentionEncoder(
            input_dim=128,
            hidden_dim=128,
            projection_dim=128,
            feedforward_hidden_dim=128,
            num_layers=2,
            num_attention_heads=4,
            use_positional_encoding=False,
        ),
        seq2vec_encoder=CnnEncoder(
            embedding_dim=128,
            num_filters=32,
            ngram_filter_sizes=(2, 3, 4, 5),
        ),
    )

    if args.checkpoint_path is not None:
        with open(args.checkpoint_path, "rb") as f:
            state_dict = torch.load(f, map_location=torch.device("cpu"))
        model.load_state_dict(state_dict)
    model.train()

    parameters = [v for n, v in model.named_parameters()]

    optimizer = BertAdam(
        params=parameters,
        lr=5e-4,
        warmup=0.1,
        t_total=10000,
        # t_total=200000,
        schedule='warmup_linear'
    )

    if torch.cuda.is_available():
        cuda_device = 0
        model.cuda(device=0)
    else:
        cuda_device = -1

    print(cuda_device)

    trainer = GradientDescentTrainer(
        cuda_device=cuda_device,

        model=model,
        optimizer=optimizer,
        checkpointer=Checkpointer(
            serialization_dir=args.serialization_dir,
            keep_most_recent_by_count=10,
        ),
        data_loader=data_loader,
        validation_data_loader=validation_data_loader,
        patience=5,
        validation_metric='+accuracy',
        num_epochs=100,
        serialization_dir=args.serialization_dir,
    )
    trainer.train()

    return


if __name__ == '__main__':
    main()

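A quick back-of-the-envelope sketch (hypothetical numbers, not part of the commit) of the initialization imbalance described in the module docstring above:

# With a roughly uniform first-level softmax, `领域无关` (irrelevant domain) gets ~0.5,
# while each of the N second-level classes under `领域相关` gets ~0.5 / N.
n_second_level = 20                      # hypothetical number of fine-grained intents
p_irrelevant = 0.5                       # first-level softmax, ~uniform at initialization
p_each_relevant = 0.5 / n_second_level   # 0.025, far below p_irrelevant

print(p_irrelevant, p_each_relevant)     # 0.5 0.025
# The argmax over leaf probabilities is therefore always the irrelevant-domain class at
# the start, which is why run.sh first trains without that class and then resumes with it.
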
examples/text_classification/telemarketing_intent_classification/5.predict_model.py
ADDED
@@ -0,0 +1,117 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
import time

from allennlp.data.tokenizers.pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.text_field_embedders.basic_text_field_embedder import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.embedding import Embedding
from allennlp.modules.seq2vec_encoders.cnn_encoder import CnnEncoder
from allennlp_models.rc.modules.seq2seq_encoders.stacked_self_attention import StackedSelfAttentionEncoder
from allennlp.predictors.text_classifier import TextClassifierPredictor
import torch

from project_settings import project_path
from toolbox.allennlp_models.text_classifier.models.hierarchical_text_classifier import HierarchicalClassifier
from toolbox.allennlp_models.text_classifier.dataset_readers.hierarchical_classification_json import HierarchicalClassificationJsonReader


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained_model_path",
        default=(project_path / "pretrained_models/chinese-bert-wwm-ext").as_posix(),
        type=str
    )
    parser.add_argument('--hierarchical_labels_pkl', default='data_dir/hierarchical_labels.pkl', type=str)
    parser.add_argument('--vocabulary_dir', default='data_dir/vocabulary', type=str)

    parser.add_argument(
        "--serialization_dir",
        default="data_dir/serialization_dir2",
        type=str
    )
    args = parser.parse_args()
    return args


def main():
    args = get_args()

    dataset_reader = HierarchicalClassificationJsonReader(
        token_indexers={
            'tokens': SingleIdTokenIndexer(
                namespace='tokens',
                lowercase_tokens=True,
                token_min_padding_length=5,
            )
        },
        tokenizer=PretrainedTransformerTokenizer(
            model_name=os.path.join(project_path, args.pretrained_model_path),
        ),
    )

    vocabulary = Vocabulary.from_files(args.vocabulary_dir)

    model = HierarchicalClassifier(
        vocab=vocabulary,
        hierarchical_labels_pkl=args.hierarchical_labels_pkl,
        text_field_embedder=BasicTextFieldEmbedder(
            token_embedders={
                'tokens': Embedding(
                    num_embeddings=vocabulary.get_vocab_size('tokens'),
                    embedding_dim=128,
                )
            }
        ),
        seq2seq_encoder=StackedSelfAttentionEncoder(
            input_dim=128,
            hidden_dim=128,
            projection_dim=128,
            feedforward_hidden_dim=128,
            num_layers=2,
            num_attention_heads=4,
            use_positional_encoding=False,
        ),
        seq2vec_encoder=CnnEncoder(
            embedding_dim=128,
            num_filters=32,
            ngram_filter_sizes=(2, 3, 4, 5),
        ),
    )

    checkpoint_path = os.path.join(args.serialization_dir, "best.th")
    with open(checkpoint_path, 'rb') as f:
        state_dict = torch.load(f, map_location="cpu")
    model.load_state_dict(state_dict, strict=True)
    model.eval()

    predictor = TextClassifierPredictor(
        model=model,
        dataset_reader=dataset_reader,
    )

    while True:
        text = input("text: ")
        if text == "Quit":
            break

        json_dict = {'sentence': text}

        begin_time = time.time()
        outputs = predictor.predict_json(
            json_dict
        )

        outputs = predictor._model.decode(outputs)
        label = outputs['label']
        print(label)
        print('time cost: {}'.format(time.time() - begin_time))
    return


if __name__ == '__main__':
    main()

examples/text_classification/telemarketing_intent_classification/6.make_json_config.py
ADDED
@@ -0,0 +1,121 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import json
import os

from allennlp.data.vocabulary import Vocabulary
import torch

from project_settings import project_path


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained_model_path",
        default=(project_path / "pretrained_models/chinese-bert-wwm-ext").as_posix(),
        type=str
    )
    parser.add_argument('--hierarchical_labels_pkl', default='data_dir/hierarchical_labels.pkl', type=str)
    parser.add_argument('--vocabulary_dir', default='data_dir/vocabulary', type=str)
    parser.add_argument('--train_subset', default='data_dir/train.json', type=str)
    parser.add_argument('--valid_subset', default='data_dir/valid.json', type=str)
    parser.add_argument("--serialization_dir", default="data_dir/serialization_dir", type=str)
    parser.add_argument("--json_config_dir", default="data_dir", type=str)
    args = parser.parse_args()
    return args


def main():
    args = get_args()

    vocabulary = Vocabulary.from_files(args.vocabulary_dir)

    if torch.cuda.is_available():
        cuda_device = 0
    else:
        cuda_device = -1

    json_config = {
        "dataset_reader": {
            "type": "hierarchical_classification_json",
            "token_indexers": {
                "tokens": {
                    "type": "single_id",
                    "namespace": "tokens",
                    "lowercase_tokens": True,
                    "token_min_padding_length": 5
                }
            },
            "tokenizer": {
                "type": "pretrained_transformer",
                "model_name": args.pretrained_model_path
            }
        },
        "train_data_path": args.train_subset,
        "validation_data_path": args.valid_subset,
        "vocabulary": {
            "directory_path": args.vocabulary_dir,
        },
        "model": {
            "type": "hierarchical_classifier",
            "hierarchical_labels_pkl": args.hierarchical_labels_pkl,
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {
                        "type": "embedding",
                        "num_embeddings": vocabulary.get_vocab_size(namespace="tokens"),
                        "embedding_dim": 128
                    }
                }
            },
            "seq2seq_encoder": {
                "type": "stacked_self_attention",
                "input_dim": 128,
                "hidden_dim": 128,
                "projection_dim": 128,
                "feedforward_hidden_dim": 128,
                "num_layers": 2,
                "num_attention_heads": 4,
                "use_positional_encoding": False
            },
            "seq2vec_encoder": {
                "type": "cnn",
                "embedding_dim": 128,
                "num_filters": 32,
                "ngram_filter_sizes": (2, 3, 4, 5),
            },
        },
        "data_loader": {
            "type": "multiprocess",
            "batch_size": 64,
            "shuffle": True
        },
        "trainer": {
            "type": "gradient_descent",
            "cuda_device": cuda_device,
            "optimizer": {
                "type": "bert_adam",
                "lr": 5e-5,
                "warmup": 0.1,
                "t_total": 50000,
                "schedule": "warmup_linear"
            },
            "checkpointer": {
                "serialization_dir": args.serialization_dir,
                "keep_most_recent_by_count": 10
            },
            "patience": 5,
            "validation_metric": "+accuracy",
            "num_epochs": 200
        }
    }

    with open(os.path.join(args.json_config_dir, "config.json"), "w", encoding="utf-8") as f:
        json.dump(json_config, f, indent=4, ensure_ascii=False)
    return


if __name__ == '__main__':
    main()

examples/text_classification/telemarketing_intent_classification/7.predict_by_archive.py
ADDED
@@ -0,0 +1,74 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
import time

from allennlp.data.tokenizers.pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.text_field_embedders.basic_text_field_embedder import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.embedding import Embedding
from allennlp.modules.seq2vec_encoders.cnn_encoder import CnnEncoder
from allennlp.models.archival import archive_model, load_archive
from allennlp_models.rc.modules.seq2seq_encoders.stacked_self_attention import StackedSelfAttentionEncoder
from allennlp.predictors.predictor import Predictor
from allennlp.predictors.text_classifier import TextClassifierPredictor
import torch

from project_settings import project_path
from toolbox.allennlp_models.text_classifier.models.hierarchical_text_classifier import HierarchicalClassifier
from toolbox.allennlp_models.text_classifier.dataset_readers.hierarchical_classification_json import HierarchicalClassificationJsonReader


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        default="给我推荐一些篮球游戏?",
        type=str
    )
    parser.add_argument(
        "--archive_file",
        default=(project_path / "trained_models/telemarketing_intent_classification_vi").as_posix(),
        type=str
    )
    parser.add_argument(
        "--pretrained_model_path",
        default=(project_path / "pretrained_models/chinese-bert-wwm-ext").as_posix(),
        type=str
    )
    parser.add_argument(
        "--predictor_name",
        default="text_classifier",
        type=str
    )
    args = parser.parse_args()
    return args


def main():
    args = get_args()

    archive = load_archive(archive_file=args.archive_file)

    predictor = Predictor.from_archive(archive, predictor_name=args.predictor_name)

    json_dict = {
        "sentence": args.text
    }

    begin_time = time.time()
    outputs = predictor.predict_json(
        json_dict
    )
    outputs = predictor._model.decode(outputs)
    label = outputs['label']
    print(label)
    print('time cost: {}'.format(time.time() - begin_time))

    return


if __name__ == '__main__':
    main()

examples/text_classification/telemarketing_intent_classification/run.sh
ADDED
@@ -0,0 +1,254 @@
#!/usr/bin/env bash

# nohup sh run.sh --system_version centos --stage 0 --stop_stage 5 &

# sh run.sh --system_version windows --stage -2 --stop_stage 8
# sh run.sh --system_version windows --stage 0 --stop_stage 0
# sh run.sh --system_version windows --stage 1 --stop_stage 1
# sh run.sh --system_version windows --stage 2 --stop_stage 2
# sh run.sh --system_version windows --stage 0 --stop_stage 5
# sh run.sh --system_version windows --stage 6 --stop_stage 6

# params
system_version="centos";
verbose=true;
stage=0  # start from 0 if you need to start from data preparation
stop_stage=5


#trained_model_name=telemarketing_intent_classification_cn
#pretrained_bert_model_name=chinese-bert-wwm-ext
#dataset_fn="telemarketing_intent_cn.xlsx"

#trained_model_name=telemarketing_intent_classification_en
#pretrained_bert_model_name=bert-base-uncased
#dataset_fn="telemarketing_intent_en.xlsx"

#trained_model_name=telemarketing_intent_classification_jp
#pretrained_bert_model_name=bert-base-japanese
#dataset_fn="telemarketing_intent_jp.xlsx"

trained_model_name=telemarketing_intent_classification_vi
pretrained_bert_model_name=bert-base-vietnamese-uncased
dataset_fn="telemarketing_intent_vi.xlsx"

# parse options
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
      eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
      old_value="(eval echo \\$$name)";
      if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval "${name}=\"$2\"";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;

    *) break;
  esac
done

$verbose && echo "system_version: ${system_version}"

work_dir="$(pwd)"
data_dir="$(pwd)/data_dir"
pretrained_models_dir="${work_dir}/../../../pretrained_models";
trained_models_dir="${work_dir}/../../../trained_models";

serialization_dir1="${data_dir}/serialization_dir1";
serialization_dir2="${data_dir}/serialization_dir2";

mkdir -p "${data_dir}"
mkdir -p "${pretrained_models_dir}"
mkdir -p "${trained_models_dir}"
mkdir -p "${serialization_dir1}"
mkdir -p "${serialization_dir2}"

vocabulary_dir="${data_dir}/vocabulary"
train_subset="${data_dir}/train.json"
valid_subset="${data_dir}/valid.json"
hierarchical_labels_pkl="${data_dir}/hierarchical_labels.pkl"
dataset_filename="${data_dir}/${dataset_fn}"

export PYTHONPATH="${work_dir}/../../.."

if [ $system_version == "windows" ]; then
  alias python3='C:/Users/tianx/PycharmProjects/virtualenv/AllenNLP/Scripts/python.exe'
elif [ $system_version == "centos" ]; then
  source /data/local/bin/AllenNLP/bin/activate
  alias python3='/data/local/bin/AllenNLP/bin/python3'
elif [ $system_version == "ubuntu" ]; then
  source /data/local/bin/AllenNLP/bin/activate
  alias python3='/data/local/bin/AllenNLP/bin/python3'
fi


declare -A pretrained_bert_model_dict
pretrained_bert_model_dict=(
  ["chinese-bert-wwm-ext"]="https://huggingface.co/hfl/chinese-bert-wwm-ext"
  ["bert-base-uncased"]="https://huggingface.co/bert-base-uncased"
  ["bert-base-japanese"]="https://huggingface.co/cl-tohoku/bert-base-japanese"
  ["bert-base-vietnamese-uncased"]="https://huggingface.co/trituenhantaoio/bert-base-vietnamese-uncased"
)
pretrained_model_dir="${pretrained_models_dir}/${pretrained_bert_model_name}"


if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then
  $verbose && echo "stage -2: download pretrained models"
  cd "${work_dir}" || exit 1;

  if [ ! -d "${pretrained_model_dir}" ]; then
    mkdir -p "${pretrained_models_dir}"
    cd "${pretrained_models_dir}" || exit 1;

    repository_url="${pretrained_bert_model_dict[${pretrained_bert_model_name}]}"
    git clone "${repository_url}"

    cd "${pretrained_model_dir}" || exit 1;
    rm flax_model.msgpack && rm pytorch_model.bin && rm tf_model.h5
    rm -rf .git/
    wget "${repository_url}/resolve/main/pytorch_model.bin"
  fi
fi


if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
  $verbose && echo "stage -1: download data"
  cd "${data_dir}" || exit 1;

  wget "https://huggingface.co/datasets/qgyd2021/telemarketing_intent/resolve/main/${dataset_fn}"

fi


if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  $verbose && echo "stage 0: prepare data without irrelevant domain (make train subset, valid subset file)"
  cd "${work_dir}" || exit 1;

  python3 1.prepare_data.py \
  --without_irrelevant_domain \
  --dataset_filename "${dataset_filename}" \
  --do_lowercase \
  --train_subset "${train_subset}" \
  --valid_subset "${valid_subset}" \

fi


if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  $verbose && echo "stage 1: make hierarchical labels dictionary (make hierarchical_labels.pkl file)"
  cd "${work_dir}" || exit 1
  python3 2.make_hierarchical_labels.py \
  --dataset_filename "${dataset_filename}" \
  --hierarchical_labels_pkl "${hierarchical_labels_pkl}" \

fi


if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  $verbose && echo "stage 2: make vocabulary (make vocabulary directory)"
  cd "${work_dir}" || exit 1
  python3 3.make_vocabulary.py \
  --pretrained_model_path "${pretrained_model_dir}" \
  --hierarchical_labels_pkl "${hierarchical_labels_pkl}" \
  --vocabulary "${vocabulary_dir}" \

fi


if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  $verbose && echo "stage 3: train model without irrelevant domain"
  cd "${work_dir}" || exit 1

  python3 4.train_model.py \
  --pretrained_model_path "${pretrained_model_dir}" \
  --hierarchical_labels_pkl "${hierarchical_labels_pkl}" \
  --vocabulary_dir "${vocabulary_dir}" \
  --train_subset "${train_subset}" \
  --valid_subset "${valid_subset}" \
  --serialization_dir "${serialization_dir1}" \

fi


if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  $verbose && echo "stage 4: prepare data with irrelevant domain"
  cd "${work_dir}" || exit 1

  python3 1.prepare_data.py \
  --dataset_filename "${dataset_filename}" \
  --do_lowercase \
  --train_subset "${train_subset}" \
  --valid_subset "${valid_subset}" \

fi


if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  $verbose && echo "stage 5: train model with irrelevant domain"
  cd "${work_dir}" || exit 1

  python3 4.train_model.py \
  --pretrained_model_path "${pretrained_model_dir}" \
  --hierarchical_labels_pkl "${hierarchical_labels_pkl}" \
  --vocabulary_dir "${vocabulary_dir}" \
  --train_subset "${train_subset}" \
  --valid_subset "${valid_subset}" \
  --serialization_dir "${serialization_dir2}" \
  --checkpoint_path "${serialization_dir1}/best.th"

fi


if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
  $verbose && echo "stage 6: make json config"
  cd "${work_dir}" || exit 1
  python3 6.make_json_config.py \
  --pretrained_model_path "${pretrained_model_dir}" \
  --hierarchical_labels_pkl "${hierarchical_labels_pkl}" \
  --vocabulary_dir "${vocabulary_dir}" \
  --train_subset "${train_subset}" \
  --valid_subset "${valid_subset}" \
  --serialization_dir "${serialization_dir2}" \
  --json_config_dir "${data_dir}" \

fi


if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
  $verbose && echo "stage 7: collect files"
  cd "${work_dir}" || exit 1;

  mkdir -p "${trained_models_dir}/${trained_model_name}"

  cp -r "${vocabulary_dir}" "${trained_models_dir}/${trained_model_name}/vocabulary/"
  cp "${serialization_dir2}/best.th" "${trained_models_dir}/${trained_model_name}/weights.th"
  cp "${data_dir}/config.json" "${trained_models_dir}/${trained_model_name}/config.json"
  cp "${hierarchical_labels_pkl}" "${trained_models_dir}/${trained_model_name}"

fi


if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
  $verbose && echo "stage 8: predict by archive"
  cd "${work_dir}" || exit 1;

  python3 7.predict_by_archive.py \
  --archive_file "${trained_models_dir}/${trained_model_name}" \
  --pretrained_model_path "${pretrained_model_dir}" \

fi

main.py
ADDED
@@ -0,0 +1,141 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
import time

from allennlp.data.tokenizers.pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.text_field_embedders.basic_text_field_embedder import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.embedding import Embedding
from allennlp.modules.seq2vec_encoders.cnn_encoder import CnnEncoder
from allennlp.models.archival import archive_model, load_archive
from allennlp_models.rc.modules.seq2seq_encoders.stacked_self_attention import StackedSelfAttentionEncoder
from allennlp.predictors.predictor import Predictor
from allennlp.predictors.text_classifier import TextClassifierPredictor
import gradio as gr
import torch

from project_settings import project_path
from toolbox.allennlp_models.text_classifier.models.hierarchical_text_classifier import HierarchicalClassifier
from toolbox.allennlp_models.text_classifier.dataset_readers.hierarchical_classification_json import HierarchicalClassificationJsonReader


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cn_archive_file",
        default=(project_path / "trained_models/telemarketing_intent_classification_cn").as_posix(),
        type=str
    )
    parser.add_argument(
        "--en_archive_file",
        default=(project_path / "trained_models/telemarketing_intent_classification_en").as_posix(),
        type=str
    )
    parser.add_argument(
        "--jp_archive_file",
        default=(project_path / "trained_models/telemarketing_intent_classification_jp").as_posix(),
        type=str
    )
    parser.add_argument(
        "--vi_archive_file",
        default=(project_path / "trained_models/telemarketing_intent_classification_vi").as_posix(),
        type=str
    )
    parser.add_argument(
        "--predictor_name",
        default="text_classifier",
        type=str
    )
    args = parser.parse_args()
    return args


def main():
    args = get_args()

    cn_archive = load_archive(archive_file=args.cn_archive_file)
    cn_predictor = Predictor.from_archive(cn_archive, predictor_name=args.predictor_name)
    en_archive = load_archive(archive_file=args.en_archive_file)
    en_predictor = Predictor.from_archive(en_archive, predictor_name=args.predictor_name)
    jp_archive = load_archive(archive_file=args.jp_archive_file)
    jp_predictor = Predictor.from_archive(jp_archive, predictor_name=args.predictor_name)
    vi_archive = load_archive(archive_file=args.vi_archive_file)
    vi_predictor = Predictor.from_archive(vi_archive, predictor_name=args.predictor_name)

    predictor_map = {
        "chinese": cn_predictor,
        "english": en_predictor,
        "japanese": jp_predictor,
        "vietnamese": vi_predictor,
    }

    def fn(text: str, language: str):
        predictor = predictor_map.get(language, cn_predictor)

        json_dict = {'sentence': text}
        outputs = predictor.predict_json(
            json_dict
        )
        outputs = predictor._model.decode(outputs)
        label = outputs['label'][0]
        prob = outputs['prob'][0]
        prob = round(prob, 4)
        return label, prob

    description = """
电销场景意图识别.
语言: 汉语, 英语, 日语, 越南语.
数据集是私有的.

model: selfattention-cnn
dataset: telemarketing_intent (https://huggingface.co/datasets/qgyd2021/telemarketing_intent)

accuracy:
chinese: 0.8002
english: 0.7011
japanese: 0.8154
vietnamese: 0.8168

"""
    demo = gr.Interface(
        fn=fn,
        inputs=[
            gr.Text(label="text"),
            gr.Dropdown(
                choices=list(sorted(predictor_map.keys())),
                label="language"
            )
        ],
        outputs=[gr.Text(label="intent"), gr.Number(label="prob")],
        examples=[
            ["你找谁", "chinese"],
            ["你是谁啊", "chinese"],
            ["不好意思我现在很忙", "chinese"],
            ["对不起, 不需要哈", "chinese"],
            ["u have got the wrong number", "english"],
            ["sure, thank a lot", "english"],
            ["please leave your message for 95688496", "english"],
            ["yes well", "english"],
            ["失礼の", "japanese"],
            ["ビートいう発表の後に、お名前とご用件をお話ください。", "japanese"],
            ["わかんない。", "japanese"],
            ["に出ることができません", "japanese"],
            ["À không phải em nha.", "vietnamese"],
            ["Dạ nhầm số rồi ạ?", "vietnamese"],
            ["Ừ, cảm ơn em nhá.", "vietnamese"],
            ["Không, chị không có tiền.", "vietnamese"],
        ],
        examples_per_page=50,
        title="Telemarketing Intent Classification",
        description=description,
    )
    demo.launch()

    return


if __name__ == '__main__':
    main()

predict.py
ADDED
@@ -0,0 +1,142 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import json

from allennlp.data.tokenizers.pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.text_field_embedders.basic_text_field_embedder import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.embedding import Embedding
from allennlp.modules.seq2vec_encoders.cnn_encoder import CnnEncoder
from allennlp.models.archival import archive_model, load_archive
from allennlp_models.rc.modules.seq2seq_encoders.stacked_self_attention import StackedSelfAttentionEncoder
from allennlp.predictors.predictor import Predictor
from allennlp.predictors.text_classifier import TextClassifierPredictor
import gradio as gr
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from project_settings import project_path
from toolbox.allennlp_models.text_classifier.models.hierarchical_text_classifier import HierarchicalClassifier
from toolbox.allennlp_models.text_classifier.dataset_readers.hierarchical_classification_json import HierarchicalClassificationJsonReader


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--excel_file",
        default=r"D:\Users\tianx\PycharmProjects\telemarketing_intent\data\excel\telemarketing_intent_vi.xlsx",
        type=str,
    )
    parser.add_argument(
        "--archive_file",
        default=(project_path / "trained_models/telemarketing_intent_classification_vi").as_posix(),
        type=str
    )
    parser.add_argument(
        "--predictor_name",
        default="text_classifier",
        type=str
    )
    parser.add_argument(
        "--top_k",
        default=10,
        type=int
    )
    parser.add_argument(
        "--output_file",
        default="intent_top_k.jsonl",
        type=str
    )
    args = parser.parse_args()
    return args


def main():
    args = get_args()

    archive = load_archive(archive_file=args.archive_file)
    predictor = Predictor.from_archive(archive, predictor_name=args.predictor_name)

    df = pd.read_excel(args.excel_file)

    with open(args.output_file, "w", encoding="utf-8") as f:
        for i, row in tqdm(df.iterrows(), total=len(df)):
            if i < 26976:
                continue

            source = row["source"]
            text = row["text"]
            label0 = row["label0"]
            label1 = row["label1"]
            selected = row["selected"]
            checked = row["checked"]

            if pd.isna(source) or source is None:
                source = None

            if pd.isna(text) or text is None:
                continue
            text = str(text)

            if pd.isna(label0) or label0 is None:
                label0 = None

            if pd.isna(label1) or label1 is None:
                label1 = None

            if pd.isna(selected) or selected is None:
                selected = None
            else:
                try:
                    selected = int(selected)
                except Exception:
                    print(type(selected))
                    selected = None

            if pd.isna(checked) or checked is None:
                checked = None
            else:
                try:
                    checked = int(checked)
                except Exception:
                    print(type(checked))
                    checked = None

            # print(text)
            json_dict = {'sentence': text}
            outputs = predictor.predict_json(
                json_dict
            )
            probs = outputs["probs"]
            arg_idx = np.argsort(probs)

            arg_idx_top_k = arg_idx[-10:]
            label_top_k = [
                predictor._model.vocab.get_token_from_index(index=idx, namespace="labels").split("_")[-1] for idx in arg_idx_top_k
            ]
            prob_top_k = [
                str(round(probs[idx], 5)) for idx in arg_idx_top_k
            ]

            row_ = {
                "source": source,
                "text": text,
                "label0": label0,
                "label1": label1,
                "selected": selected,
                "checked": checked,
                "predict_label_top_k": ";".join(list(reversed(label_top_k))),
                "predict_prob_top_k": ";".join(list(reversed(prob_top_k)))
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            f.write("{}\n".format(row_))

    return


if __name__ == '__main__':
    main()

pretrained_models/bert-base-japanese/.gitattributes
ADDED
@@ -0,0 +1,9 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text

pretrained_models/bert-base-japanese/README.md
ADDED
@@ -0,0 +1,43 @@
+---
+language: ja
+license: cc-by-sa-4.0
+datasets:
+- wikipedia
+widget:
+- text: 東北大学で[MASK]の研究をしています。
+---
+
+# BERT base Japanese (IPA dictionary)
+
+This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
+
+This version of the model processes input texts with word-level tokenization based on the IPA dictionary, followed by the WordPiece subword tokenization.
+
+The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/tree/v1.0).
+
+## Model architecture
+
+The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads.
+
+## Training Data
+
+The model is trained on Japanese Wikipedia as of September 1, 2019.
+To generate the training corpus, [WikiExtractor](https://github.com/attardi/wikiextractor) is used to extract plain texts from a dump file of Wikipedia articles.
+The text files used for the training are 2.6GB in size, consisting of approximately 17M sentences.
+
+## Tokenization
+
+The texts are first tokenized by the [MeCab](https://taku910.github.io/mecab/) morphological parser with the IPA dictionary and then split into subwords by the WordPiece algorithm.
+The vocabulary size is 32000.
+
+## Training
+
+The model is trained with the same configuration as the original BERT; 512 tokens per instance, 256 instances per batch, and 1M training steps.
+
+## Licenses
+
+The pretrained models are distributed under the terms of the [Creative Commons Attribution-ShareAlike 3.0](https://creativecommons.org/licenses/by-sa/3.0/).
+
+## Acknowledgments
+
+For training models, we used Cloud TPUs provided by the [TensorFlow Research Cloud](https://www.tensorflow.org/tfrc/) program.
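As a quick illustration of the MeCab + WordPiece pipeline described above, here is a minimal sketch (not part of this commit) that loads the vendored copy from this repository; it assumes `transformers` plus the `fugashi`/`ipadic` pins from requirements.txt are installed.

```python
from transformers import AutoTokenizer

# config.json sets tokenizer_class to BertJapaneseTokenizer, so AutoTokenizer
# resolves MeCab word tokenization followed by WordPiece from the local files.
tokenizer = AutoTokenizer.from_pretrained("pretrained_models/bert-base-japanese")
print(tokenizer.tokenize("東北大学で自然言語処理の研究をしています。"))
```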
pretrained_models/bert-base-japanese/config.json
ADDED
@@ -0,0 +1,20 @@
+{
+  "architectures": [
+    "BertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "tokenizer_class": "BertJapaneseTokenizer",
+  "type_vocab_size": 2,
+  "vocab_size": 32000
+}
pretrained_models/bert-base-japanese/tokenizer_config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "do_lower_case": false,
+  "subword_tokenizer_type": "wordpiece",
+  "word_tokenizer_type": "mecab"
+}
pretrained_models/bert-base-japanese/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
pretrained_models/bert-base-uncased/.gitattributes
ADDED
@@ -0,0 +1,11 @@
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text
pretrained_models/bert-base-uncased/LICENSE
ADDED
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
pretrained_models/bert-base-uncased/README.md
ADDED
@@ -0,0 +1,251 @@
+---
+language: en
+tags:
+- exbert
+license: apache-2.0
+datasets:
+- bookcorpus
+- wikipedia
+---
+
+# BERT base model (uncased)
+
+Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in
+[this paper](https://arxiv.org/abs/1810.04805) and first released in
+[this repository](https://github.com/google-research/bert). This model is uncased: it does not make a difference
+between english and English.
+
+Disclaimer: The team releasing BERT did not write a model card for this model so this model card has been written by
+the Hugging Face team.
+
+## Model description
+
+BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it
+was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of
+publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it
+was pretrained with two objectives:
+
+- Masked language modeling (MLM): taking a sentence, the model randomly masks 15% of the words in the input, then runs
+  the entire masked sentence through the model and has to predict the masked words. This is different from traditional
+  recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like
+  GPT which internally mask the future tokens. It allows the model to learn a bidirectional representation of the
+  sentence.
+- Next sentence prediction (NSP): the model concatenates two masked sentences as inputs during pretraining. Sometimes
+  they correspond to sentences that were next to each other in the original text, sometimes not. The model then has to
+  predict if the two sentences were following each other or not.
+
+This way, the model learns an inner representation of the English language that can then be used to extract features
+useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard
+classifier using the features produced by the BERT model as inputs.
+
+## Model variations
+
+BERT has originally been released in base and large variations, for cased and uncased input text. The uncased models also strip out accent markers.
+Chinese and multilingual uncased and cased versions followed shortly after.
+Modified preprocessing with whole word masking replaced subpiece masking in a following work, with the release of two models.
+Another 24 smaller models were released afterward.
+
+The detailed release history can be found on the [google-research/bert readme](https://github.com/google-research/bert/blob/master/README.md) on github.
+
+| Model | #params | Language |
+|------------------------|--------------------------------|-------|
+| [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English |
+| [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English |
+| [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English |
+| [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | English |
+| [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese |
+| [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple |
+| [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English |
+| [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English |
+
+## Intended uses & limitations
+
+You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to
+be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?filter=bert) to look for
+fine-tuned versions of a task that interests you.
+
+Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked)
+to make decisions, such as sequence classification, token classification or question answering. For tasks such as text
+generation you should look at models like GPT2.
+
+### How to use
+
+You can use this model directly with a pipeline for masked language modeling:
+
+```python
+>>> from transformers import pipeline
+>>> unmasker = pipeline('fill-mask', model='bert-base-uncased')
+>>> unmasker("Hello I'm a [MASK] model.")
+
+[{'sequence': "[CLS] hello i'm a fashion model. [SEP]",
+  'score': 0.1073106899857521,
+  'token': 4827,
+  'token_str': 'fashion'},
+ {'sequence': "[CLS] hello i'm a role model. [SEP]",
+  'score': 0.08774490654468536,
+  'token': 2535,
+  'token_str': 'role'},
+ {'sequence': "[CLS] hello i'm a new model. [SEP]",
+  'score': 0.05338378623127937,
+  'token': 2047,
+  'token_str': 'new'},
+ {'sequence': "[CLS] hello i'm a super model. [SEP]",
+  'score': 0.04667217284440994,
+  'token': 3565,
+  'token_str': 'super'},
+ {'sequence': "[CLS] hello i'm a fine model. [SEP]",
+  'score': 0.027095865458250046,
+  'token': 2986,
+  'token_str': 'fine'}]
+```
+
+Here is how to use this model to get the features of a given text in PyTorch:
+
+```python
+from transformers import BertTokenizer, BertModel
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+model = BertModel.from_pretrained("bert-base-uncased")
+text = "Replace me by any text you'd like."
+encoded_input = tokenizer(text, return_tensors='pt')
+output = model(**encoded_input)
+```
+
+and in TensorFlow:
+
+```python
+from transformers import BertTokenizer, TFBertModel
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+model = TFBertModel.from_pretrained("bert-base-uncased")
+text = "Replace me by any text you'd like."
+encoded_input = tokenizer(text, return_tensors='tf')
+output = model(encoded_input)
+```
+
+### Limitations and bias
+
+Even if the training data used for this model could be characterized as fairly neutral, this model can have biased
+predictions:
+
+```python
+>>> from transformers import pipeline
+>>> unmasker = pipeline('fill-mask', model='bert-base-uncased')
+>>> unmasker("The man worked as a [MASK].")
+
+[{'sequence': '[CLS] the man worked as a carpenter. [SEP]',
+  'score': 0.09747550636529922,
+  'token': 10533,
+  'token_str': 'carpenter'},
+ {'sequence': '[CLS] the man worked as a waiter. [SEP]',
+  'score': 0.0523831807076931,
+  'token': 15610,
+  'token_str': 'waiter'},
+ {'sequence': '[CLS] the man worked as a barber. [SEP]',
+  'score': 0.04962705448269844,
+  'token': 13362,
+  'token_str': 'barber'},
+ {'sequence': '[CLS] the man worked as a mechanic. [SEP]',
+  'score': 0.03788609802722931,
+  'token': 15893,
+  'token_str': 'mechanic'},
+ {'sequence': '[CLS] the man worked as a salesman. [SEP]',
+  'score': 0.037680890411138535,
+  'token': 18968,
+  'token_str': 'salesman'}]
+
+>>> unmasker("The woman worked as a [MASK].")
+
+[{'sequence': '[CLS] the woman worked as a nurse. [SEP]',
+  'score': 0.21981462836265564,
+  'token': 6821,
+  'token_str': 'nurse'},
+ {'sequence': '[CLS] the woman worked as a waitress. [SEP]',
+  'score': 0.1597415804862976,
+  'token': 13877,
+  'token_str': 'waitress'},
+ {'sequence': '[CLS] the woman worked as a maid. [SEP]',
+  'score': 0.1154729500412941,
+  'token': 10850,
+  'token_str': 'maid'},
+ {'sequence': '[CLS] the woman worked as a prostitute. [SEP]',
+  'score': 0.037968918681144714,
+  'token': 19215,
+  'token_str': 'prostitute'},
+ {'sequence': '[CLS] the woman worked as a cook. [SEP]',
+  'score': 0.03042375110089779,
+  'token': 5660,
+  'token_str': 'cook'}]
+```
+
+This bias will also affect all fine-tuned versions of this model.
+
+## Training data
+
+The BERT model was pretrained on [BookCorpus](https://yknzhu.wixsite.com/mbweb), a dataset consisting of 11,038
+unpublished books and [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia) (excluding lists, tables and
+headers).
+
+## Training procedure
+
+### Preprocessing
+
+The texts are lowercased and tokenized using WordPiece and a vocabulary size of 30,000. The inputs of the model are
+then of the form:
+
+```
+[CLS] Sentence A [SEP] Sentence B [SEP]
+```
+
+With probability 0.5, sentence A and sentence B correspond to two consecutive sentences in the original corpus, and in
+the other cases, it's another random sentence in the corpus. Note that what is considered a sentence here is a
+consecutive span of text usually longer than a single sentence. The only constraint is that the result with the two
+"sentences" has a combined length of less than 512 tokens.
+
+The details of the masking procedure for each sentence are the following:
+- 15% of the tokens are masked.
+- In 80% of the cases, the masked tokens are replaced by `[MASK]`.
+- In 10% of the cases, the masked tokens are replaced by a random token (different) from the one they replace.
+- In the 10% remaining cases, the masked tokens are left as is.
+
+### Pretraining
+
+The model was trained on 4 cloud TPUs in Pod configuration (16 TPU chips total) for one million steps with a batch size
+of 256. The sequence length was limited to 128 tokens for 90% of the steps and 512 for the remaining 10%. The optimizer
+used is Adam with a learning rate of 1e-4, \\(\beta_{1} = 0.9\\) and \\(\beta_{2} = 0.999\\), a weight decay of 0.01,
+learning rate warmup for 10,000 steps and linear decay of the learning rate after.
+
+## Evaluation results
+
+When fine-tuned on downstream tasks, this model achieves the following results:
+
+Glue test results:
+
+| Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average |
+|:----:|:-----------:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:-------:|
+|      | 84.6/83.4   | 71.2 | 90.5 | 93.5  | 52.1 | 85.8  | 88.9 | 66.4 | 79.6    |
+
+
+### BibTeX entry and citation info
+
+```bibtex
+@article{DBLP:journals/corr/abs-1810-04805,
+  author    = {Jacob Devlin and
+               Ming{-}Wei Chang and
+               Kenton Lee and
+               Kristina Toutanova},
+  title     = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language
+               Understanding},
+  journal   = {CoRR},
+  volume    = {abs/1810.04805},
+  year      = {2018},
+  url       = {http://arxiv.org/abs/1810.04805},
+  archivePrefix = {arXiv},
+  eprint    = {1810.04805},
+  timestamp = {Tue, 30 Oct 2018 20:39:56 +0100},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-1810-04805.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+```
+
+<a href="https://huggingface.co/exbert/?model=bert-base-uncased">
+	<img width="300px" src="https://cdn-media.huggingface.co/exbert/button.png">
+</a>
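The masking procedure in the model card above (15% of tokens selected; of those, 80% become `[MASK]`, 10% a random token, 10% unchanged) is straightforward to reproduce. A minimal sketch under those stated ratios, not the original TensorFlow preprocessing code:

```python
import random

def mask_tokens(token_ids, mask_id, vocab_size, mlm_prob=0.15):
    """BERT-style masking over a list of token ids (illustrative sketch)."""
    inputs, labels = list(token_ids), [-100] * len(token_ids)  # -100: ignored by the loss
    for i in range(len(inputs)):
        if random.random() >= mlm_prob:
            continue
        labels[i] = inputs[i]                         # the model must predict the original token
        r = random.random()
        if r < 0.8:
            inputs[i] = mask_id                       # 80%: replace with [MASK]
        elif r < 0.9:
            inputs[i] = random.randrange(vocab_size)  # 10%: replace with a random token
        # remaining 10%: keep the original token
    return inputs, labels
```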
pretrained_models/bert-base-uncased/config.json
ADDED
@@ -0,0 +1,23 @@
+{
+  "architectures": [
+    "BertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.6.0.dev0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}
pretrained_models/bert-base-uncased/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
pretrained_models/bert-base-uncased/tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
+{
+  "do_lower_case": true
+}
pretrained_models/bert-base-uncased/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
pretrained_models/bert-base-vietnamese-uncased/.gitattributes
ADDED
@@ -0,0 +1,17 @@
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
pretrained_models/bert-base-vietnamese-uncased/README.md
ADDED
@@ -0,0 +1,22 @@
+## Usage
+```python
+from transformers import BertForSequenceClassification
+from transformers import BertTokenizer
+model = BertForSequenceClassification.from_pretrained("trituenhantaoio/bert-base-vietnamese-uncased")
+tokenizer = BertTokenizer.from_pretrained("trituenhantaoio/bert-base-vietnamese-uncased")
+```
+
+### References
+
+```
+@article{ttnt2020bert,
+  title={Vietnamese BERT: Pretrained on News and Wiki},
+  author={trituenhantao.io},
+  year = {2020},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/trituenhantaoio/vn-bert-base-uncased}},
+}
+```
+
+[trituenhantao.io](https://trituenhantao.io)
pretrained_models/bert-base-vietnamese-uncased/config.json
ADDED
@@ -0,0 +1,27 @@
+{
+  "_name_or_path": "/content/drive/My Drive/Colab_data/2020/vietnamese-bert-19-11-2020/vietnamese-bert-10-2020/bert_model/pytorch_model",
+  "attention_probs_dropout_prob": 0.1,
+  "directionality": "bidi",
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.2.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 32000
+}
pretrained_models/bert-base-vietnamese-uncased/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
pretrained_models/bert-base-vietnamese-uncased/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
+{"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null}
pretrained_models/bert-base-vietnamese-uncased/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
pretrained_models/chinese-bert-wwm-ext/.gitattributes
ADDED
@@ -0,0 +1,9 @@
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
pretrained_models/chinese-bert-wwm-ext/README.md
ADDED
@@ -0,0 +1,52 @@
+---
+language:
+- zh
+license: "apache-2.0"
+---
+## Chinese BERT with Whole Word Masking
+For further accelerating Chinese natural language processing, we provide **Chinese pre-trained BERT with Whole Word Masking**.
+
+**[Pre-Training with Whole Word Masking for Chinese BERT](https://arxiv.org/abs/1906.08101)**
+Yiming Cui, Wanxiang Che, Ting Liu, Bing Qin, Ziqing Yang, Shijin Wang, Guoping Hu
+
+This repository is developed based on: https://github.com/google-research/bert
+
+You may also be interested in,
+- Chinese BERT series: https://github.com/ymcui/Chinese-BERT-wwm
+- Chinese MacBERT: https://github.com/ymcui/MacBERT
+- Chinese ELECTRA: https://github.com/ymcui/Chinese-ELECTRA
+- Chinese XLNet: https://github.com/ymcui/Chinese-XLNet
+- Knowledge Distillation Toolkit - TextBrewer: https://github.com/airaria/TextBrewer
+
+More resources by HFL: https://github.com/ymcui/HFL-Anthology
+
+## Citation
+If you find the technical report or resource useful, please cite the following technical report in your paper.
+- Primary: https://arxiv.org/abs/2004.13922
+```
+@inproceedings{cui-etal-2020-revisiting,
+  title = "Revisiting Pre-Trained Models for {C}hinese Natural Language Processing",
+  author = "Cui, Yiming and
+    Che, Wanxiang and
+    Liu, Ting and
+    Qin, Bing and
+    Wang, Shijin and
+    Hu, Guoping",
+  booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: Findings",
+  month = nov,
+  year = "2020",
+  address = "Online",
+  publisher = "Association for Computational Linguistics",
+  url = "https://www.aclweb.org/anthology/2020.findings-emnlp.58",
+  pages = "657--668",
+}
+```
+- Secondary: https://arxiv.org/abs/1906.08101
+```
+@article{chinese-bert-wwm,
+  title={Pre-Training with Whole Word Masking for Chinese BERT},
+  author={Cui, Yiming and Che, Wanxiang and Liu, Ting and Qin, Bing and Yang, Ziqing and Wang, Shijin and Hu, Guoping},
+  journal={arXiv preprint arXiv:1906.08101},
+  year={2019}
+}
+```
pretrained_models/chinese-bert-wwm-ext/added_tokens.json
ADDED
@@ -0,0 +1 @@
+{}
pretrained_models/chinese-bert-wwm-ext/config.json
ADDED
@@ -0,0 +1,26 @@
+{
+  "architectures": [
+    "BertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "type_vocab_size": 2,
+  "vocab_size": 21128
+}
pretrained_models/chinese-bert-wwm-ext/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
pretrained_models/chinese-bert-wwm-ext/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
pretrained_models/chinese-bert-wwm-ext/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
+{"init_inputs": []}
pretrained_models/chinese-bert-wwm-ext/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
project_settings.py
ADDED
@@ -0,0 +1,12 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import os
+from pathlib import Path
+
+
+project_path = os.path.abspath(os.path.dirname(__file__))
+project_path = Path(project_path)
+
+
+if __name__ == '__main__':
+    pass
requirements.txt
ADDED
@@ -0,0 +1,14 @@
+gradio==3.20.1
+allennlp==2.10.1
+allennlp-models==2.10.1
+torch==1.12.1
+overrides==7.3.1
+pytorch_pretrained_bert==0.6.2
+pydantic==1.10.12
+thinc==7.4.6
+spacy==2.3.9
+fugashi==1.1.2
+ipadic==1.0.0
+pandas==2.0.3
+xlrd==1.2.0
+openpyxl==3.0.9
toolbox/__init__.py
ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == '__main__':
+    pass
toolbox/allennlp_models/text_classifier/dataset_readers/__init__.py
ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == '__main__':
+    pass
toolbox/allennlp_models/text_classifier/dataset_readers/hierarchical_classification_json.py
ADDED
@@ -0,0 +1,99 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from typing import Dict, Iterable, List, Union
+import logging
+import json
+from overrides import overrides
+from allennlp.common.file_utils import cached_path
+from allennlp.data.dataset_readers.dataset_reader import DatasetReader
+from allennlp.data.fields import LabelField, TextField, Field, ListField
+from allennlp.data.instance import Instance
+from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
+from allennlp.data.tokenizers import Tokenizer, SpacyTokenizer
+from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter
+
+logger = logging.getLogger(__name__)
+
+
+@DatasetReader.register("hierarchical_classification_json")
+class HierarchicalClassificationJsonReader(DatasetReader):
+    def __init__(self,
+                 n_hierarchical: int = 2,
+                 token_indexers: Dict[str, TokenIndexer] = None,
+                 tokenizer: Tokenizer = None,
+                 segment_sentences: bool = False,
+                 max_sequence_length: int = None,
+                 skip_label_indexing: bool = False,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self._n_hierarchical = n_hierarchical
+        self._tokenizer = tokenizer or SpacyTokenizer()
+        self._segment_sentences = segment_sentences
+        self._max_sequence_length = max_sequence_length
+        self._skip_label_indexing = skip_label_indexing
+        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
+        if self._segment_sentences:
+            self._sentence_segmenter = SpacySentenceSplitter()
+
+    @overrides
+    def _read(self, file_path) -> Iterable[Instance]:
+        with open(cached_path(file_path), "r", encoding='utf-8') as data_file:
+            for line in data_file.readlines():
+                if not line:
+                    continue
+                items = json.loads(line)
+                text = items["text"]
+
+                labels = [items.get("label{}".format(idx), None) for idx in range(self._n_hierarchical)]
+                if all(labels):
+                    label = '_'.join(labels)
+                else:
+                    label = None
+
+                if label is not None:
+                    if self._skip_label_indexing:
+                        try:
+                            label = int(label)
+                        except ValueError:
+                            raise ValueError('Labels must be integers if skip_label_indexing is True.')
+                    else:
+                        label = str(label)
+                instance = self.text_to_instance(text, label)
+                if instance is not None:
+                    yield instance
+
+    def _truncate(self, tokens):
+        if len(tokens) > self._max_sequence_length:
+            tokens = tokens[:self._max_sequence_length]
+        return tokens
+
+    @overrides
+    # def text_to_instance(self, text: str, label: Union[str, int] = None) -> Instance:
+    def text_to_instance(self, *inputs) -> Instance:
+        if len(inputs) == 1:
+            text = inputs[0]
+            label = None
+        elif len(inputs) == 2:
+            text, label = inputs
+        else:
+            raise AssertionError
+
+        fields: Dict[str, Field] = {}
+        if self._segment_sentences:
+            sentences: List[Field] = []
+            sentence_splits = self._sentence_segmenter.split_sentences(text)
+            for sentence in sentence_splits:
+                word_tokens = self._tokenizer.tokenize(sentence)
+                if self._max_sequence_length is not None:
+                    word_tokens = self._truncate(word_tokens)
+                sentences.append(TextField(word_tokens, self._token_indexers))
+            fields['tokens'] = ListField(sentences)
+        else:
+            tokens = self._tokenizer.tokenize(text)
+            if self._max_sequence_length is not None:
+                tokens = self._truncate(tokens)
+            fields['tokens'] = TextField(tokens, self._token_indexers)
+        if label is not None:
+            fields['label'] = LabelField(label,
+                                         skip_indexing=self._skip_label_indexing)
+        return Instance(fields)
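For clarity, a sketch of the JSON-lines layout this reader consumes with `n_hierarchical=2`. The texts, label values, and the `train.jsonl` file name below are illustrative examples, not taken from the project's dataset:

```python
import json

samples = [
    {"text": "好的，没问题", "label0": "领域相关", "label1": "肯定答复"},
    {"text": "现在不方便，晚点再说", "label0": "领域相关", "label1": "用户正忙"},
]
with open("train.jsonl", "w", encoding="utf-8") as f:   # hypothetical file name
    for sample in samples:
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")

# The reader joins label0..label{n-1} with "_" into the single label it indexes,
# e.g. "领域相关_肯定答复"; lines missing any level are treated as unlabeled.
```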
toolbox/allennlp_models/text_classifier/models/__init__.py
ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == '__main__':
+    pass
toolbox/allennlp_models/text_classifier/models/hierarchical_text_classifier.py
ADDED
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
from collections import OrderedDict
|
4 |
+
import pickle
|
5 |
+
from typing import Dict, Optional
|
6 |
+
|
7 |
+
from overrides import overrides
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
|
11 |
+
from allennlp.data import Vocabulary
|
12 |
+
from allennlp.models.model import Model
|
13 |
+
from allennlp.modules import Seq2SeqEncoder, Seq2VecEncoder, TextFieldEmbedder
|
14 |
+
from allennlp.nn import InitializerApplicator, RegularizerApplicator
|
15 |
+
from allennlp.nn.util import get_text_field_mask
|
16 |
+
from allennlp.training.metrics import CategoricalAccuracy
|
17 |
+
|
18 |
+
from toolbox.torch.modules.loss import FocalLoss, NegativeEntropy
|
19 |
+
|
20 |
+
|
21 |
+
@Model.register("hierarchical_classifier")
|
22 |
+
class HierarchicalClassifier(Model):
|
23 |
+
def __init__(self,
|
24 |
+
vocab: Vocabulary,
|
25 |
+
hierarchical_labels_pkl: str,
|
26 |
+
text_field_embedder: TextFieldEmbedder,
|
27 |
+
seq2vec_encoder: Seq2VecEncoder,
|
28 |
+
seq2seq_encoder: Seq2SeqEncoder = None,
|
29 |
+
dropout: float = None,
|
30 |
+
num_labels: int = None,
|
31 |
+
label_namespace: str = "labels",
|
32 |
+
balance_probs: bool = False,
|
33 |
+
initializer: InitializerApplicator = InitializerApplicator(),
|
34 |
+
regularizer: Optional[RegularizerApplicator] = None) -> None:
|
35 |
+
|
36 |
+
super().__init__(vocab, regularizer)
|
37 |
+
self._hierarchical_labels_pkl = hierarchical_labels_pkl
|
38 |
+
self._text_field_embedder = text_field_embedder
|
39 |
+
|
40 |
+
if seq2seq_encoder:
|
41 |
+
self._seq2seq_encoder = seq2seq_encoder
|
42 |
+
else:
|
43 |
+
self._seq2seq_encoder = None
|
44 |
+
|
45 |
+
self._seq2vec_encoder = seq2vec_encoder
|
46 |
+
self._classifier_input_dim = self._seq2vec_encoder.get_output_dim()
|
47 |
+
|
48 |
+
if dropout:
|
49 |
+
self._dropout = torch.nn.Dropout(dropout)
|
50 |
+
else:
|
51 |
+
self._dropout = None
|
52 |
+
|
53 |
+
self._label_namespace = label_namespace
|
54 |
+
|
55 |
+
if num_labels:
|
56 |
+
self._num_labels = num_labels
|
57 |
+
else:
|
58 |
+
self._num_labels = vocab.get_vocab_size(namespace=self._label_namespace)
|
59 |
+
|
60 |
+
with open(self._hierarchical_labels_pkl, 'rb') as f:
|
61 |
+
hierarchical_labels = pickle.load(f)
|
62 |
+
self._classification_layer = HierarchicalSoftMaxClassificationLayer(
|
63 |
+
classifier_input_dim=self._classifier_input_dim,
|
64 |
+
hierarchical_labels=hierarchical_labels,
|
65 |
+
activation='softmax',
|
66 |
+
)
|
67 |
+
|
68 |
+
self._accuracy = CategoricalAccuracy()
|
69 |
+
|
70 |
+
if balance_probs:
|
71 |
+
self._loss = NegativeEntropy(
|
72 |
+
inputs_logits=False,
|
73 |
+
)
|
74 |
+
else:
|
75 |
+
self._loss = FocalLoss(
|
76 |
+
num_classes=self._num_labels,
|
77 |
+
inputs_logits=False,
|
78 |
+
)
|
79 |
+
initializer(self)
|
80 |
+
|
81 |
+
def forward(self, # type: ignore
|
82 |
+
tokens: Dict[str, torch.LongTensor],
|
83 |
+
label: torch.IntTensor = None) -> Dict[str, torch.Tensor]:
|
84 |
+
embedded_text = self._text_field_embedder(tokens)
|
85 |
+
mask = get_text_field_mask(tokens)
|
86 |
+
|
87 |
+
if self._seq2seq_encoder:
|
88 |
+
embedded_text = self._seq2seq_encoder(embedded_text, mask=mask)
|
89 |
+
|
90 |
+
embedded_text = self._seq2vec_encoder(embedded_text, mask=mask)
|
91 |
+
|
92 |
+
if self._dropout:
|
93 |
+
embedded_text = self._dropout(embedded_text)
|
94 |
+
|
95 |
+
probs = self._classification_layer(embedded_text)
|
96 |
+
|
97 |
+
output_dict = {"probs": probs}
|
98 |
+
|
99 |
+
if label is not None:
|
100 |
+
loss = self._loss(probs, label.long().view(-1))
|
101 |
+
output_dict["loss"] = loss
|
102 |
+
self._accuracy(probs, label)
|
103 |
+
|
104 |
+
return output_dict
|
105 |
+
|
106 |
+
def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
|
107 |
+
predictions = output_dict["probs"]
|
108 |
+
predictions = torch.tensor(predictions)
|
109 |
+
if predictions.dim() == 2:
|
110 |
+
predictions_list = [predictions[i] for i in range(predictions.shape[0])]
|
111 |
+
else:
|
112 |
+
predictions_list = [predictions]
|
113 |
+
classes = list()
|
114 |
+
prob = list()
|
115 |
+
for prediction in predictions_list:
|
116 |
+
label_idx = prediction.argmax(dim=-1).item()
|
117 |
+
label_str = self.vocab.get_index_to_token_vocabulary(self._label_namespace).get(label_idx, str(label_idx))
|
118 |
+
classes.append(label_str)
|
119 |
+
prob.append(prediction[label_idx].item())
|
120 |
+
output_dict["label"] = classes
|
121 |
+
output_dict["prob"] = prob
|
122 |
+
return output_dict
|
123 |
+
|
124 |
+
def get_metrics(self, reset: bool = False) -> Dict[str, float]:
|
125 |
+
metrics = {'accuracy': self._accuracy.get_metric(reset)}
|
126 |
+
return metrics
|
127 |
+
|
128 |
+
|
129 |
+
class HierarchicalSoftMaxClassificationLayer(nn.Module):
|
130 |
+
"""多层 softmax 实现多极文本分类
|
131 |
+
|
132 |
+
由于初始化时, 各层 softmax 的概率趋于平衡.
|
133 |
+
|
134 |
+
因此在第一层时 `领域无关` 就分到了 50% 的概率.
|
135 |
+
|
136 |
+
`领域相关` 中的各类别去分剩下的 50% 的概率.
|
137 |
+
这会导致模型一开始时输出的类别全是 `领域无关`, 这导致模型无法优化.
|
138 |
+
|
139 |
+
解决方��:
|
140 |
+
1. 从数据集中去除 `领域无关` 数据. 并训练模型.
|
141 |
+
2. 等模型收敛之后, 再使用包含 `领域无关` 的数据集, 让模型加载之前的权重, 并重新开始训练模型.
|
142 |
+
|
143 |
+
"""
|
144 |
+
|
145 |
+
@staticmethod
|
146 |
+
def demo1():
|
147 |
+
# hierarchical_labels = OrderedDict({
|
148 |
+
# '领域相关': OrderedDict({
|
149 |
+
# '肯定答复': [
|
150 |
+
# '肯定(好的)', '肯定(可以)', '肯定(正确)'
|
151 |
+
# ],
|
152 |
+
# '否定答复': [
|
153 |
+
# '否定(不可以)', '否定(不知道)', '否定(错误)'
|
154 |
+
# ],
|
155 |
+
# '用户正忙': [
|
156 |
+
# '用户正忙'
|
157 |
+
# ]
|
158 |
+
# }),
|
159 |
+
# '领域无关': OrderedDict({
|
160 |
+
# '领域无关': [
|
161 |
+
# '领域无关'
|
162 |
+
# ]
|
163 |
+
# })
|
164 |
+
# })
|
165 |
+
|
166 |
+
hierarchical_labels = OrderedDict({
|
167 |
+
'领域相关': ['肯定答复', '否定答复', '用户正忙', '查联系方式'],
|
168 |
+
'领域无关': ['领域无关'],
|
169 |
+
})
|
170 |
+
|
171 |
+
softmax_layer = HierarchicalSoftMaxClassificationLayer(
|
172 |
+
classifier_input_dim=3,
|
173 |
+
hierarchical_labels=hierarchical_labels,
|
174 |
+
activation='softmax',
|
175 |
+
# activation='sigmoid',
|
176 |
+
|
177 |
+
)
|
178 |
+
|
179 |
+
for k, v in softmax_layer.__dict__['_modules'].items():
|
180 |
+
print(k)
|
181 |
+
print(v)
|
182 |
+
|
183 |
+
inputs = torch.ones(size=(2, 3), dtype=torch.float32)
|
184 |
+
|
185 |
+
probs = softmax_layer.forward(inputs)
|
186 |
+
print(probs)
|
187 |
+
print(torch.sum(probs, dim=-1))
|
188 |
+
return
|
189 |
+
|
190 |
+
def __init__(self, classifier_input_dim: int, hierarchical_labels: OrderedDict, activation: str = 'softmax'):
|
191 |
+
super(HierarchicalSoftMaxClassificationLayer, self).__init__()
|
192 |
+
self.classifier_input_dim = classifier_input_dim
|
193 |
+
self.hierarchical_labels = hierarchical_labels
|
194 |
+
self.activation: str = activation
|
195 |
+
|
196 |
+
self._init_hierarchical_classification_layer(hierarchical_labels)
|
197 |
+
|
198 |
+
def _init_hierarchical_classification_layer(self,
|
199 |
+
hierarchical_labels: OrderedDict,
|
200 |
+
key: str = 'classification_layer',
|
201 |
+
child_class: str = None):
|
202 |
+
num_labels = len(hierarchical_labels)
|
203 |
+
|
204 |
+
classification_layer = torch.nn.Linear(self.classifier_input_dim, num_labels)
|
205 |
+
if child_class is not None:
|
206 |
+
key = '{header}_{child_class}'.format(header=key, child_class=child_class)
|
207 |
+
setattr(
|
208 |
+
self,
|
209 |
+
key,
|
210 |
+
classification_layer
|
211 |
+
)
|
212 |
+
|
213 |
+
branch = 0
|
214 |
+
for k, v in hierarchical_labels.items():
|
215 |
+
if isinstance(v, OrderedDict):
|
216 |
+
self._init_hierarchical_classification_layer(
|
217 |
+
v,
|
218 |
+
key=key,
|
219 |
+
child_class=branch,
|
220 |
+
)
|
221 |
+
elif isinstance(v, list):
|
222 |
+
num_labels = len(v)
|
223 |
+
classification_layer = torch.nn.Linear(self.classifier_input_dim, num_labels)
|
224 |
+
setattr(
|
225 |
+
self,
|
226 |
+
'{key}_{child_class}'.format(key=key, child_class=branch),
|
227 |
+
classification_layer,
|
228 |
+
)
|
229 |
+
else:
|
230 |
+
raise NotImplementedError
|
231 |
+
branch += 1
|
232 |
+
return
|
233 |
+
|
234 |
+
def forward(self, inputs: torch.Tensor) -> torch.Tensor:
|
235 |
+
key = 'classification_layer'
|
236 |
+
classification_layer = getattr(self, key)
|
237 |
+
logits = classification_layer.forward(inputs)
|
238 |
+
probs = torch.softmax(logits, dim=-1)
|
239 |
+
|
240 |
+
probs = self._layer_probs(
|
241 |
+
inputs=inputs,
|
242 |
+
probs=probs,
|
243 |
+
key=key,
|
244 |
+
)
|
245 |
+
|
246 |
+
return probs
|
247 |
+
|
248 |
+
def _layer_probs(self,
|
249 |
+
inputs: torch.Tensor,
|
250 |
+
probs: torch.Tensor,
|
251 |
+
key: str,
|
252 |
+
):
|
253 |
+
|
254 |
+
result = list()
|
255 |
+
for child_class in range(probs.shape[1]):
|
256 |
+
parent_probs = torch.unsqueeze(probs[:, child_class], dim=-1)
|
257 |
+
|
258 |
+
child_key = '{key}_{child_class}'.format(key=key, child_class=child_class)
|
259 |
+
classification_layer = getattr(self, child_key)
|
260 |
+
logits = classification_layer.forward(inputs)
|
261 |
+
|
262 |
+
child_child_key = '{key}_{child_class}'.format(key=child_key, child_class=0)
|
263 |
+
if hasattr(self, child_child_key):
|
264 |
+
child_probs = torch.softmax(logits, dim=-1)
|
265 |
+
child_probs = child_probs * parent_probs
|
266 |
+
|
267 |
+
child_probs = self._layer_probs(
|
268 |
+
inputs=inputs,
|
269 |
+
probs=child_probs,
|
270 |
+
key=child_key,
|
271 |
+
)
|
272 |
+
else:
|
273 |
+
if self.activation == 'softmax':
|
274 |
+
child_probs = torch.softmax(logits, dim=-1)
|
275 |
+
else:
|
276 |
+
child_probs = torch.sigmoid(logits)
|
277 |
+
child_probs = child_probs * parent_probs
|
278 |
+
|
279 |
+
result.append(child_probs)
|
280 |
+
|
281 |
+
result = torch.concat(result, dim=-1)
|
282 |
+
return result
|
283 |
+
|
284 |
+
|
285 |
+
def demo1():
|
286 |
+
HierarchicalSoftMaxClassificationLayer.demo1()
|
287 |
+
return
|
288 |
+
|
289 |
+
|
290 |
+
if __name__ == '__main__':
|
291 |
+
demo1()
|
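A minimal usage sketch (not part of the diff) of the HierarchicalSoftMaxClassificationLayer defined above, assuming it is importable from the hierarchical_text_classifier module added in this commit. With activation='softmax' every child distribution is multiplied by its parent probability, so the concatenated leaf probabilities still sum to 1 per row; with activation='sigmoid' the leaves no longer sum to 1.

```python
import torch
from collections import OrderedDict

# Assumed import path, taken from the file list of this commit.
from toolbox.allennlp_models.text_classifier.models.hierarchical_text_classifier import \
    HierarchicalSoftMaxClassificationLayer

hierarchical_labels = OrderedDict({
    '领域相关': ['肯定答复', '否定答复', '用户正忙', '查联系方式'],
    '领域无关': ['领域无关'],
})

layer = HierarchicalSoftMaxClassificationLayer(
    classifier_input_dim=3,
    hierarchical_labels=hierarchical_labels,
    activation='softmax',
)

inputs = torch.randn(2, 3)
probs = layer.forward(inputs)     # shape=[2, 5]: 4 leaves in the first group + 1 in the second
print(probs.shape)
print(torch.sum(probs, dim=-1))   # ~1.0 per row: child probs are scaled by their parent prob
```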
toolbox/torch/__init__.py
ADDED
@@ -0,0 +1,6 @@
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
|
5 |
+
if __name__ == '__main__':
|
6 |
+
pass
|
toolbox/torch/modules/__init__.py
ADDED
@@ -0,0 +1,6 @@
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
|
5 |
+
if __name__ == '__main__':
|
6 |
+
pass
|
toolbox/torch/modules/loss.py
ADDED
@@ -0,0 +1,738 @@
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import math
|
4 |
+
from typing import List, Optional
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
import torch
|
8 |
+
import torch.nn as nn
|
9 |
+
import torch.nn.functional as F
|
10 |
+
from torch.nn.modules.loss import _Loss
|
11 |
+
from torch.autograd import Variable
|
12 |
+
|
13 |
+
|
14 |
+
class ClassBalancedLoss(_Loss):
|
15 |
+
"""
|
16 |
+
https://arxiv.org/abs/1901.05555
|
17 |
+
"""
|
18 |
+
@staticmethod
|
19 |
+
def demo1():
|
20 |
+
batch_loss: torch.FloatTensor = torch.randn(size=(2, 1), dtype=torch.float32)
|
21 |
+
targets: torch.LongTensor = torch.tensor([1, 2], dtype=torch.long)
|
22 |
+
|
23 |
+
class_balanced_loss = ClassBalancedLoss(
|
24 |
+
num_classes=3,
|
25 |
+
num_samples_each_class=[300, 433, 50],
|
26 |
+
reduction='mean',
|
27 |
+
)
|
28 |
+
loss = class_balanced_loss.forward(batch_loss=batch_loss, targets=targets)
|
29 |
+
print(loss)
|
30 |
+
return
|
31 |
+
|
32 |
+
@staticmethod
|
33 |
+
def demo2():
|
34 |
+
inputs: torch.FloatTensor = torch.randn(size=(2, 3), dtype=torch.float32)
|
35 |
+
targets: torch.LongTensor = torch.tensor([1, 2], dtype=torch.long)
|
36 |
+
|
37 |
+
focal_loss = FocalLoss(
|
38 |
+
num_classes=3,
|
39 |
+
# reduction='mean',
|
40 |
+
# reduction='sum',
|
41 |
+
reduction='none',
|
42 |
+
)
|
43 |
+
batch_loss = focal_loss.forward(inputs, targets)
|
44 |
+
print(batch_loss)
|
45 |
+
|
46 |
+
class_balanced_loss = ClassBalancedLoss(
|
47 |
+
num_classes=3,
|
48 |
+
num_samples_each_class=[300, 433, 50],
|
49 |
+
reduction='mean',
|
50 |
+
)
|
51 |
+
loss = class_balanced_loss.forward(batch_loss=batch_loss, targets=targets)
|
52 |
+
print(loss)
|
53 |
+
|
54 |
+
return
|
55 |
+
|
56 |
+
def __init__(self,
|
57 |
+
num_classes: int,
|
58 |
+
num_samples_each_class: List[int],
|
59 |
+
beta: float = 0.999,
|
60 |
+
reduction: str = 'mean') -> None:
|
61 |
+
super(ClassBalancedLoss, self).__init__(None, None, reduction)
|
62 |
+
|
63 |
+
effective_num = 1.0 - np.power(beta, num_samples_each_class)
|
64 |
+
weights = (1.0 - beta) / np.array(effective_num)
|
65 |
+
self.weights = weights / np.sum(weights) * num_classes
|
66 |
+
|
67 |
+
def forward(self, batch_loss: torch.FloatTensor, targets: torch.LongTensor):
|
68 |
+
"""
|
69 |
+
:param batch_loss: shape=[batch_size, 1]
|
70 |
+
:param targets: shape=[batch_size,]
|
71 |
+
:return:
|
72 |
+
"""
|
73 |
+
weights = list()
|
74 |
+
targets = targets.numpy()
|
75 |
+
for target in targets:
|
76 |
+
weights.append([self.weights[target]])
|
77 |
+
|
78 |
+
weights = torch.tensor(weights, dtype=torch.float32)
|
79 |
+
batch_loss = weights * batch_loss
|
80 |
+
|
81 |
+
if self.reduction == 'mean':
|
82 |
+
loss = batch_loss.mean()
|
83 |
+
elif self.reduction == 'sum':
|
84 |
+
loss = batch_loss.sum()
|
85 |
+
else:
|
86 |
+
loss = batch_loss
|
87 |
+
return loss
|
88 |
+
|
89 |
+
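As a quick sanity check of the effective-number weighting used by ClassBalancedLoss above (w_c = (1 - beta) / (1 - beta^{n_c}), renormalized to sum to num_classes), here is a small numpy sketch with the class counts from demo1; the rare class receives by far the largest weight.

```python
import numpy as np

beta = 0.999
num_samples_each_class = np.array([300, 433, 50])

# effective number of samples per class, then inverse weighting
effective_num = 1.0 - np.power(beta, num_samples_each_class)
weights = (1.0 - beta) / effective_num
weights = weights / np.sum(weights) * len(num_samples_each_class)
print(weights)  # approximately [0.43, 0.31, 2.26]: the 50-sample class is upweighted most
```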
|
90 |
+
class EqualizationLoss(_Loss):
|
91 |
+
"""
|
92 |
+
For sigmoid multi-label classification in image recognition, where there is one extra background class in addition to the num_classes foreground classes.
|
93 |
+
Equalization Loss
|
94 |
+
https://arxiv.org/abs/2003.05176
|
95 |
+
Equalization Loss v2
|
96 |
+
https://arxiv.org/abs/2012.08548
|
97 |
+
"""
|
98 |
+
|
99 |
+
@staticmethod
|
100 |
+
def demo1():
|
101 |
+
logits: torch.FloatTensor = torch.randn(size=(3, 3), dtype=torch.float32)
|
102 |
+
targets: torch.LongTensor = torch.tensor([1, 2, 3], dtype=torch.long)
|
103 |
+
|
104 |
+
equalization_loss = EqualizationLoss(
|
105 |
+
num_samples_each_class=[300, 433, 50],
|
106 |
+
threshold=100,
|
107 |
+
reduction='mean',
|
108 |
+
)
|
109 |
+
loss = equalization_loss.forward(logits=logits, targets=targets)
|
110 |
+
print(loss)
|
111 |
+
return
|
112 |
+
|
113 |
+
def __init__(self,
|
114 |
+
num_samples_each_class: List[int],
|
115 |
+
threshold: int = 100,
|
116 |
+
reduction: str = 'mean') -> None:
|
117 |
+
super(EqualizationLoss, self).__init__(None, None, reduction)
|
118 |
+
self.num_samples_each_class = np.array(num_samples_each_class, dtype=np.int32)
|
119 |
+
self.threshold = threshold
|
120 |
+
|
121 |
+
def forward(self,
|
122 |
+
logits: torch.FloatTensor,
|
123 |
+
targets: torch.LongTensor
|
124 |
+
):
|
125 |
+
"""
|
126 |
+
num_classes + 1 corresponds to the extra background class.
|
127 |
+
:param logits: shape=[batch_size, num_classes]
|
128 |
+
:param targets: shape=[batch_size]
|
129 |
+
:return:
|
130 |
+
"""
|
131 |
+
batch_size, num_classes = logits.size()
|
132 |
+
|
133 |
+
one_hot_targets = F.one_hot(targets, num_classes=num_classes + 1)
|
134 |
+
one_hot_targets = one_hot_targets[:, :-1]
|
135 |
+
|
136 |
+
exclude = self.exclude_func(
|
137 |
+
num_classes=num_classes,
|
138 |
+
targets=targets
|
139 |
+
)
|
140 |
+
is_tail = self.threshold_func(
|
141 |
+
num_classes=num_classes,
|
142 |
+
num_samples_each_class=self.num_samples_each_class,
|
143 |
+
threshold=self.threshold,
|
144 |
+
)
|
145 |
+
|
146 |
+
weights = 1 - exclude * is_tail * (1 - one_hot_targets)
|
147 |
+
|
148 |
+
batch_loss = F.binary_cross_entropy_with_logits(
|
149 |
+
logits,
|
150 |
+
one_hot_targets.float(),
|
151 |
+
reduction='none'
|
152 |
+
)
|
153 |
+
|
154 |
+
batch_loss = weights * batch_loss
|
155 |
+
|
156 |
+
if self.reduction == 'mean':
|
157 |
+
loss = batch_loss.mean()
|
158 |
+
elif self.reduction == 'sum':
|
159 |
+
loss = batch_loss.sum()
|
160 |
+
else:
|
161 |
+
loss = batch_loss
|
162 |
+
|
163 |
+
loss = loss / num_classes
|
164 |
+
return loss
|
165 |
+
|
166 |
+
@staticmethod
|
167 |
+
def exclude_func(num_classes: int, targets: torch.LongTensor):
|
168 |
+
"""
|
169 |
+
The last class is the background class.
|
170 |
+
:param num_classes: int,
|
171 |
+
:param targets: shape=[batch_size,]
|
172 |
+
:return: weight, shape=[batch_size, num_classes]
|
173 |
+
"""
|
174 |
+
batch_size = targets.shape[0]
|
175 |
+
weight = (targets != num_classes).float()
|
176 |
+
weight = weight.view(batch_size, 1).expand(batch_size, num_classes)
|
177 |
+
return weight
|
178 |
+
|
179 |
+
@staticmethod
|
180 |
+
def threshold_func(num_classes: int, num_samples_each_class: np.ndarray, threshold: int):
|
181 |
+
"""
|
182 |
+
:param num_classes: int,
|
183 |
+
:param num_samples_each_class: shape=[num_classes]
|
184 |
+
:param threshold: int,
|
185 |
+
:return: weight, shape=[1, num_classes]
|
186 |
+
"""
|
187 |
+
weight = torch.zeros(size=(num_classes,))
|
188 |
+
weight[num_samples_each_class < threshold] = 1
|
189 |
+
weight = torch.unsqueeze(weight, dim=0)
|
190 |
+
return weight
|
191 |
+
|
192 |
+
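A small illustrative check (not part of the commit) of the weight matrix built in EqualizationLoss.forward above: for foreground samples, the negative terms of tail classes (count < threshold) are masked out, so frequent foreground examples do not suppress the rare classes; background samples keep all weights at 1.

```python
import torch
import torch.nn.functional as F

num_classes = 3
targets = torch.tensor([1, 2, 3])                    # 3 == background class
one_hot = F.one_hot(targets, num_classes + 1)[:, :-1]

# exclude: 1 for foreground samples, 0 for background samples
exclude = (targets != num_classes).float().view(-1, 1).expand(-1, num_classes)
# is_tail: only the 50-sample class is below threshold=100 in the demo above
is_tail = torch.tensor([[0., 0., 1.]])

weights = 1 - exclude * is_tail * (1 - one_hot)
print(weights)
# row 0 (target=1): the tail-class column is 0, so its negative BCE term is ignored
# row 2 (background): all weights stay 1
```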
|
193 |
+
class FocalLoss(_Loss):
|
194 |
+
"""
|
195 |
+
https://arxiv.org/abs/1708.02002
|
196 |
+
"""
|
197 |
+
@staticmethod
|
198 |
+
def demo1():
|
199 |
+
inputs: torch.FloatTensor = torch.randn(size=(2, 3), dtype=torch.float32)
|
200 |
+
targets: torch.LongTensor = torch.tensor([1, 2], dtype=torch.long)
|
201 |
+
|
202 |
+
focal_loss = FocalLoss(
|
203 |
+
num_classes=3,
|
204 |
+
reduction='mean',
|
205 |
+
# reduction='sum',
|
206 |
+
# reduction='none',
|
207 |
+
)
|
208 |
+
loss = focal_loss.forward(inputs, targets)
|
209 |
+
print(loss)
|
210 |
+
return
|
211 |
+
|
212 |
+
def __init__(self,
|
213 |
+
num_classes: int,
|
214 |
+
alpha: List[float] = None,
|
215 |
+
gamma: int = 2,
|
216 |
+
reduction: str = 'mean',
|
217 |
+
inputs_logits: bool = True) -> None:
|
218 |
+
"""
|
219 |
+
:param num_classes:
|
220 |
+
:param alpha:
|
221 |
+
:param gamma:
|
222 |
+
:param reduction: (`none`, `mean`, `sum`) available.
|
223 |
+
:param inputs_logits: if False, the inputs should be probs.
|
224 |
+
"""
|
225 |
+
super(FocalLoss, self).__init__(None, None, reduction)
|
226 |
+
if alpha is None:
|
227 |
+
self.alpha = torch.ones(num_classes, 1)
|
228 |
+
else:
|
229 |
+
self.alpha = torch.tensor(alpha, dtype=torch.float32)
|
230 |
+
self.gamma = gamma
|
231 |
+
self.num_classes = num_classes
|
232 |
+
self.inputs_logits = inputs_logits
|
233 |
+
|
234 |
+
def forward(self,
|
235 |
+
inputs: torch.FloatTensor,
|
236 |
+
targets: torch.LongTensor):
|
237 |
+
"""
|
238 |
+
:param inputs: logits, shape=[batch_size, num_classes]
|
239 |
+
:param targets: shape=[batch_size,]
|
240 |
+
:return:
|
241 |
+
"""
|
242 |
+
batch_size, num_classes = inputs.shape
|
243 |
+
|
244 |
+
if self.inputs_logits:
|
245 |
+
probs = F.softmax(inputs, dim=-1)
|
246 |
+
else:
|
247 |
+
probs = inputs
|
248 |
+
|
249 |
+
# class_mask = inputs.data.new(batch_size, num_classes).fill_(0)
|
250 |
+
class_mask = torch.zeros(size=(batch_size, num_classes), dtype=inputs.dtype, device=inputs.device)
|
251 |
+
# class_mask = Variable(class_mask)
|
252 |
+
ids = targets.view(-1, 1)
|
253 |
+
class_mask.scatter_(1, ids.data, 1.)
|
254 |
+
|
255 |
+
if inputs.is_cuda and not self.alpha.is_cuda:
|
256 |
+
self.alpha = self.alpha.cuda()
|
257 |
+
alpha = self.alpha[ids.data.view(-1)]
|
258 |
+
|
259 |
+
probs = (probs * class_mask).sum(1).view(-1, 1)
|
260 |
+
|
261 |
+
log_p = probs.log()
|
262 |
+
|
263 |
+
batch_loss = -alpha*(torch.pow((1-probs), self.gamma))*log_p
|
264 |
+
|
265 |
+
if self.reduction == 'mean':
|
266 |
+
loss = batch_loss.mean()
|
267 |
+
elif self.reduction == 'sum':
|
268 |
+
loss = batch_loss.sum()
|
269 |
+
else:
|
270 |
+
loss = batch_loss
|
271 |
+
return loss
|
272 |
+
|
273 |
+
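The focal loss above implements FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t); with gamma=0 and the default uniform alpha it reduces to ordinary cross entropy. A quick check, assuming the FocalLoss class defined above is in scope:

```python
import torch
import torch.nn.functional as F

inputs = torch.randn(4, 3)
targets = torch.tensor([0, 2, 1, 2])

focal_loss = FocalLoss(num_classes=3, gamma=0, reduction='mean')
print(focal_loss.forward(inputs, targets))
print(F.cross_entropy(inputs, targets))  # should match up to floating point error
```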
|
274 |
+
class HingeLoss(_Loss):
|
275 |
+
@staticmethod
|
276 |
+
def demo1():
|
277 |
+
inputs: torch.FloatTensor = torch.randn(size=(2, 3), dtype=torch.float32)
|
278 |
+
targets: torch.LongTensor = torch.tensor([1, 2], dtype=torch.long)
|
279 |
+
|
280 |
+
hinge_loss = HingeLoss(
|
281 |
+
margin_list=[300, 433, 50],
|
282 |
+
reduction='mean',
|
283 |
+
)
|
284 |
+
loss = hinge_loss.forward(inputs=inputs, targets=targets)
|
285 |
+
print(loss)
|
286 |
+
return
|
287 |
+
|
288 |
+
def __init__(self,
|
289 |
+
margin_list: List[float],
|
290 |
+
max_margin: float = 0.5,
|
291 |
+
scale: float = 1.0,
|
292 |
+
weight: Optional[torch.Tensor] = None,
|
293 |
+
reduction: str = 'mean') -> None:
|
294 |
+
super(HingeLoss, self).__init__(None, None, reduction)
|
295 |
+
|
296 |
+
self.max_margin = max_margin
|
297 |
+
self.scale = scale
|
298 |
+
self.weight = weight
|
299 |
+
|
300 |
+
margin_list = np.array(margin_list)
|
301 |
+
margin_list = margin_list * (max_margin / np.max(margin_list))
|
302 |
+
self.margin_list = torch.tensor(margin_list, dtype=torch.float32)
|
303 |
+
|
304 |
+
def forward(self,
|
305 |
+
inputs: torch.FloatTensor,
|
306 |
+
targets: torch.LongTensor
|
307 |
+
):
|
308 |
+
"""
|
309 |
+
:param inputs: logits, shape=[batch_size, num_classes]
|
310 |
+
:param targets: shape=[batch_size,]
|
311 |
+
:return:
|
312 |
+
"""
|
313 |
+
batch_size, num_classes = inputs.shape
|
314 |
+
one_hot_targets = F.one_hot(targets, num_classes=num_classes)
|
315 |
+
margin_list = torch.unsqueeze(self.margin_list, dim=0)
|
316 |
+
|
317 |
+
batch_margin = torch.sum(margin_list * one_hot_targets, dim=-1)
|
318 |
+
batch_margin = torch.unsqueeze(batch_margin, dim=-1)
|
319 |
+
inputs_margin = inputs - batch_margin
|
320 |
+
|
321 |
+
# Lower the logit of the target class slightly to create a margin boundary.
|
322 |
+
logits = torch.where(one_hot_targets > 0, inputs_margin, inputs)
|
323 |
+
|
324 |
+
loss = F.cross_entropy(
|
325 |
+
input=self.scale * logits,
|
326 |
+
target=targets,
|
327 |
+
weight=self.weight,
|
328 |
+
reduction=self.reduction,
|
329 |
+
)
|
330 |
+
return loss
|
331 |
+
|
332 |
+
|
333 |
+
class HingeLinear(nn.Module):
|
334 |
+
"""
|
335 |
+
use this instead of `HingeLoss`, then you can combine it with `FocalLoss` or others.
|
336 |
+
"""
|
337 |
+
def __init__(self,
|
338 |
+
margin_list: List[float],
|
339 |
+
max_margin: float = 0.5,
|
340 |
+
scale: float = 1.0,
|
341 |
+
weight: Optional[torch.Tensor] = None
|
342 |
+
) -> None:
|
343 |
+
super(HingeLinear, self).__init__()
|
344 |
+
|
345 |
+
self.max_margin = max_margin
|
346 |
+
self.scale = scale
|
347 |
+
self.weight = weight
|
348 |
+
|
349 |
+
margin_list = np.array(margin_list)
|
350 |
+
margin_list = margin_list * (max_margin / np.max(margin_list))
|
351 |
+
self.margin_list = torch.tensor(margin_list, dtype=torch.float32)
|
352 |
+
|
353 |
+
def forward(self,
|
354 |
+
inputs: torch.FloatTensor,
|
355 |
+
targets: torch.LongTensor
|
356 |
+
):
|
357 |
+
"""
|
358 |
+
:param inputs: logits, shape=[batch_size, num_classes]
|
359 |
+
:param targets: shape=[batch_size,]
|
360 |
+
:return:
|
361 |
+
"""
|
362 |
+
if self.training and targets is not None:
|
363 |
+
batch_size, num_classes = inputs.shape
|
364 |
+
one_hot_targets = F.one_hot(targets, num_classes=num_classes)
|
365 |
+
margin_list = torch.unsqueeze(self.margin_list, dim=0)
|
366 |
+
|
367 |
+
batch_margin = torch.sum(margin_list * one_hot_targets, dim=-1)
|
368 |
+
batch_margin = torch.unsqueeze(batch_margin, dim=-1)
|
369 |
+
inputs_margin = inputs - batch_margin
|
370 |
+
|
371 |
+
# Lower the logit of the target class slightly to create a margin boundary.
|
372 |
+
logits = torch.where(one_hot_targets > 0, inputs_margin, inputs)
|
373 |
+
logits = logits * self.scale
|
374 |
+
else:
|
375 |
+
logits = inputs
|
376 |
+
return logits
|
377 |
+
|
378 |
+
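As the HingeLinear docstring suggests, the margin adjustment can be decoupled from the loss: apply HingeLinear to the logits during training and feed the result to another criterion such as FocalLoss. A minimal sketch, assuming both classes defined above are in scope:

```python
import torch

hinge_linear = HingeLinear(margin_list=[300, 433, 50], max_margin=0.5, scale=30.0)
focal_loss = FocalLoss(num_classes=3, reduction='mean')

inputs = torch.randn(2, 3)
targets = torch.tensor([1, 2])

hinge_linear.train()                             # margins are only applied in training mode
logits = hinge_linear.forward(inputs, targets)   # margin-adjusted, scaled logits
loss = focal_loss.forward(logits, targets)
print(loss)
```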
|
379 |
+
class LDAMLoss(_Loss):
|
380 |
+
"""
|
381 |
+
https://arxiv.org/abs/1906.07413
|
382 |
+
"""
|
383 |
+
@staticmethod
|
384 |
+
def demo1():
|
385 |
+
inputs: torch.FloatTensor = torch.randn(size=(2, 3), dtype=torch.float32)
|
386 |
+
targets: torch.LongTensor = torch.tensor([1, 2], dtype=torch.long)
|
387 |
+
|
388 |
+
ldam_loss = LDAMLoss(
|
389 |
+
num_samples_each_class=[300, 433, 50],
|
390 |
+
reduction='mean',
|
391 |
+
)
|
392 |
+
loss = ldam_loss.forward(inputs=inputs, targets=targets)
|
393 |
+
print(loss)
|
394 |
+
return
|
395 |
+
|
396 |
+
def __init__(self,
|
397 |
+
num_samples_each_class: List[int],
|
398 |
+
max_margin: float = 0.5,
|
399 |
+
scale: float = 30.0,
|
400 |
+
weight: Optional[torch.Tensor] = None,
|
401 |
+
reduction: str = 'mean') -> None:
|
402 |
+
super(LDAMLoss, self).__init__(None, None, reduction)
|
403 |
+
|
404 |
+
margin_list = np.power(num_samples_each_class, -0.25)
|
405 |
+
margin_list = margin_list * (max_margin / np.max(margin_list))
|
406 |
+
|
407 |
+
self.num_samples_each_class = num_samples_each_class
|
408 |
+
self.margin_list = torch.tensor(margin_list, dtype=torch.float32)
|
409 |
+
self.scale = scale
|
410 |
+
self.weight = weight
|
411 |
+
|
412 |
+
def forward(self,
|
413 |
+
inputs: torch.FloatTensor,
|
414 |
+
targets: torch.LongTensor
|
415 |
+
):
|
416 |
+
"""
|
417 |
+
:param inputs: logits, shape=[batch_size, num_classes]
|
418 |
+
:param targets: shape=[batch_size,]
|
419 |
+
:return:
|
420 |
+
"""
|
421 |
+
batch_size, num_classes = inputs.shape
|
422 |
+
one_hot_targets = F.one_hot(targets, num_classes=num_classes)
|
423 |
+
margin_list = torch.unsqueeze(self.margin_list, dim=0)
|
424 |
+
|
425 |
+
batch_margin = torch.sum(margin_list * one_hot_targets, dim=-1)
|
426 |
+
batch_margin = torch.unsqueeze(batch_margin, dim=-1)
|
427 |
+
inputs_margin = inputs - batch_margin
|
428 |
+
|
429 |
+
# Lower the logit of the target class slightly to create a margin boundary.
|
430 |
+
logits = torch.where(one_hot_targets > 0, inputs_margin, inputs)
|
431 |
+
|
432 |
+
loss = F.cross_entropy(
|
433 |
+
input=self.scale * logits,
|
434 |
+
target=targets,
|
435 |
+
weight=self.weight,
|
436 |
+
reduction=self.reduction,
|
437 |
+
)
|
438 |
+
return loss
|
439 |
+
|
440 |
+
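The LDAM margins above follow margin_j ∝ n_j^(-1/4), rescaled so that the rarest class receives max_margin; tail classes therefore get the widest margins. A quick numeric check with the counts used in demo1:

```python
import numpy as np

num_samples_each_class = [300, 433, 50]
max_margin = 0.5

margins = np.power(num_samples_each_class, -0.25)
margins = margins * (max_margin / np.max(margins))
print(margins)  # approximately [0.32, 0.29, 0.50]: the 50-sample class gets the largest margin
```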
|
441 |
+
class NegativeEntropy(_Loss):
|
442 |
+
def __init__(self,
|
443 |
+
reduction: str = 'mean',
|
444 |
+
inputs_logits: bool = True) -> None:
|
445 |
+
super(NegativeEntropy, self).__init__(None, None, reduction)
|
446 |
+
self.inputs_logits = inputs_logits
|
447 |
+
|
448 |
+
def forward(self,
|
449 |
+
inputs: torch.FloatTensor,
|
450 |
+
targets: torch.LongTensor):
|
451 |
+
if self.inputs_logits:
|
452 |
+
probs = F.softmax(inputs, dim=-1)
|
453 |
+
log_probs = torch.log(probs)
|
454 |
+
else:
|
455 |
+
probs = inputs
|
456 |
+
log_probs = torch.log(probs)
|
457 |
+
|
458 |
+
weighted_negative_likelihood = - log_probs * probs
|
459 |
+
|
460 |
+
loss = - weighted_negative_likelihood.sum()
|
461 |
+
return loss
|
462 |
+
|
463 |
+
|
464 |
+
class LargeMarginSoftMaxLoss(_Loss):
|
465 |
+
"""
|
466 |
+
Alias: L-Softmax
|
467 |
+
|
468 |
+
https://arxiv.org/abs/1612.02295
|
469 |
+
https://github.com/wy1iu/LargeMargin_Softmax_Loss
|
470 |
+
https://github.com/amirhfarzaneh/lsoftmax-pytorch/blob/master/lsoftmax.py
|
471 |
+
|
472 |
+
Reference:
|
473 |
+
https://www.jianshu.com/p/06cc3f84aa85
|
474 |
+
|
475 |
+
The paper argues that the combination of softmax and cross entropy does not explicitly encourage discriminative feature learning.
|
476 |
+
|
477 |
+
"""
|
478 |
+
def __init__(self,
|
479 |
+
reduction: str = 'mean') -> None:
|
480 |
+
super(LargeMarginSoftMaxLoss, self).__init__(None, None, reduction)
|
481 |
+
|
482 |
+
|
483 |
+
class AngularSoftMaxLoss(_Loss):
|
484 |
+
"""
|
485 |
+
Alias: A-Softmax
|
486 |
+
|
487 |
+
https://arxiv.org/abs/1704.08063
|
488 |
+
|
489 |
+
https://github.com/woshildh/a-softmax_pytorch/blob/master/a_softmax.py
|
490 |
+
|
491 |
+
Reference:
|
492 |
+
https://www.jianshu.com/p/06cc3f84aa85
|
493 |
+
|
494 |
+
The authors seem to assume that face embeddings lie on a sphere, so mapping the vectors onto a sphere is helpful.
|
495 |
+
"""
|
496 |
+
def __init__(self,
|
497 |
+
reduction: str = 'mean') -> None:
|
498 |
+
super(AngularSoftMaxLoss, self).__init__(None, None, reduction)
|
499 |
+
|
500 |
+
|
501 |
+
class AdditiveMarginSoftMax(_Loss):
|
502 |
+
"""
|
503 |
+
Alias: AM-Softmax
|
504 |
+
|
505 |
+
https://arxiv.org/abs/1801.05599
|
506 |
+
|
507 |
+
Large Margin Cosine Loss
|
508 |
+
https://arxiv.org/abs/1801.09414
|
509 |
+
|
510 |
+
Reference:
|
511 |
+
https://www.jianshu.com/p/06cc3f84aa85
|
512 |
+
|
513 |
+
Notes:
|
514 |
+
Compared with a plain softmax over the logits,
|
515 |
+
it subtracts m from the logit of the true label, pushing the model to make that value larger.
|
516 |
+
It also multiplies every logit by s, which controls the relative scale of the logits.
|
517 |
+
It is somewhat similar to HingeLoss.
|
518 |
+
"""
|
519 |
+
def __init__(self,
|
520 |
+
reduction: str = 'mean') -> None:
|
521 |
+
super(AdditiveMarginSoftMax, self).__init__(None, None, reduction)
|
522 |
+
|
523 |
+
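The AdditiveMarginSoftMax class above is only a stub in this commit, so the formula described in its docstring is sketched functionally below. The function name am_softmax_loss and the defaults m=0.35, s=30 are illustrative assumptions, not part of the repository.

```python
import torch
import torch.nn.functional as F

def am_softmax_loss(cos_logits: torch.Tensor, targets: torch.LongTensor,
                    m: float = 0.35, s: float = 30.0) -> torch.Tensor:
    # subtract m from the true-class (cosine) logit, scale everything by s,
    # then apply ordinary cross entropy, as described in the docstring above
    one_hot = F.one_hot(targets, num_classes=cos_logits.shape[-1]).float()
    adjusted = cos_logits - m * one_hot
    return F.cross_entropy(s * adjusted, targets)

cos_logits = torch.randn(2, 3).clamp(-1, 1)   # stand-in for cosine similarities
targets = torch.tensor([1, 2])
print(am_softmax_loss(cos_logits, targets))
```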
|
524 |
+
class AdditiveAngularMarginSoftMax(_Loss):
|
525 |
+
"""
|
526 |
+
Alias: ArcFace, AAM-Softmax
|
527 |
+
|
528 |
+
ArcFace: Additive Angular Margin Loss for Deep Face Recognition
|
529 |
+
https://arxiv.org/abs/1801.07698
|
530 |
+
|
531 |
+
Reference code:
|
532 |
+
https://github.com/huangkeju/AAMSoftmax-OpenMax/blob/main/AAMSoftmax%2BOvA/metrics.py
|
533 |
+
|
534 |
+
"""
|
535 |
+
@staticmethod
|
536 |
+
def demo1():
|
537 |
+
"""
|
538 |
+
Conversion between degrees and radians:
|
539 |
+
pi / 180 corresponds to 1 degree,
|
540 |
+
pi / 180 = 0.01745
|
541 |
+
"""
|
542 |
+
|
543 |
+
# degrees to radians
|
544 |
+
degree = 10
|
545 |
+
result = degree * math.pi / 180
|
546 |
+
print(result)
|
547 |
+
|
548 |
+
# radians to degrees
|
549 |
+
radian = 0.2
|
550 |
+
result = radian / (math.pi / 180)
|
551 |
+
print(result)
|
552 |
+
|
553 |
+
return
|
554 |
+
|
555 |
+
def __init__(self,
|
556 |
+
hidden_size: int,
|
557 |
+
num_labels: int,
|
558 |
+
margin: float = 0.2,
|
559 |
+
scale: float = 10.0,
|
560 |
+
):
|
561 |
+
"""
|
562 |
+
:param hidden_size:
|
563 |
+
:param num_labels:
|
564 |
+
:param margin: recommended angle range is [10, 30] degrees, i.e. [0.1745, 0.5236] in radians
|
565 |
+
:param scale:
|
566 |
+
"""
|
567 |
+
super(AdditiveAngularMarginSoftMax, self).__init__()
|
568 |
+
self.margin = margin
|
569 |
+
self.scale = scale
|
570 |
+
self.weight = torch.nn.Parameter(torch.FloatTensor(num_labels, hidden_size), requires_grad=True)
|
571 |
+
nn.init.xavier_uniform_(self.weight)
|
572 |
+
|
573 |
+
self.cos_margin = math.cos(self.margin)
|
574 |
+
self.sin_margin = math.sin(self.margin)
|
575 |
+
|
576 |
+
# sin(a-b) = sin(a)cos(b) - cos(a)sin(b)
|
577 |
+
# sin(pi - a) = sin(a)
|
578 |
+
|
579 |
+
self.loss = nn.CrossEntropyLoss()
|
580 |
+
|
581 |
+
def forward(self,
|
582 |
+
inputs: torch.Tensor,
|
583 |
+
label: torch.LongTensor = None
|
584 |
+
):
|
585 |
+
"""
|
586 |
+
:param inputs: shape=[batch_size, ..., hidden_size]
|
587 |
+
:param label:
|
588 |
+
:return: logits
|
589 |
+
"""
|
590 |
+
x = F.normalize(inputs)
|
591 |
+
weight = F.normalize(self.weight)
|
592 |
+
cosine = F.linear(x, weight)
|
593 |
+
|
594 |
+
if self.training:
|
595 |
+
|
596 |
+
# sin^2 + cos^2 = 1
|
597 |
+
sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))
|
598 |
+
|
599 |
+
# cos(a+b) = cos(a)cos(b) - sin(a)sin(b)
|
600 |
+
cosine_theta_margin = cosine * self.cos_margin - sine * self.sin_margin
|
601 |
+
|
602 |
+
# when the `cosine > - self.cos_margin` there is enough space to add margin on theta.
|
603 |
+
cosine_theta_margin = torch.where(cosine > - self.cos_margin, cosine_theta_margin, cosine - (self.margin * self.sin_margin))
|
604 |
+
|
605 |
+
one_hot = torch.zeros_like(cosine)
|
606 |
+
one_hot.scatter_(1, label.view(-1, 1), 1)
|
607 |
+
|
608 |
+
#
|
609 |
+
logits = torch.where(one_hot == 1, cosine_theta_margin, cosine)
|
610 |
+
logits = logits * self.scale
|
611 |
+
else:
|
612 |
+
logits = cosine
|
613 |
+
|
614 |
+
loss = self.loss(logits, label)
|
615 |
+
# prec1 = accuracy(output.detach(), label.detach(), topk=(1,))[0]
|
616 |
+
return loss
|
617 |
+
|
618 |
+
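The training branch of AdditiveAngularMarginSoftMax above computes cos(theta + m) from cosine = cos(theta) using the identity cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m). A quick numeric check of that step:

```python
import math
import torch

margin = 0.2
theta = torch.tensor([0.3, 1.0, 2.0])         # angles in [0, pi], where sine is non-negative
cosine = torch.cos(theta)

sine = torch.sqrt((1.0 - cosine * cosine).clamp(0, 1))
cosine_theta_margin = cosine * math.cos(margin) - sine * math.sin(margin)
print(cosine_theta_margin)
print(torch.cos(theta + margin))              # identical up to floating point error
```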
|
619 |
+
class AdditiveAngularMarginLinear(nn.Module):
|
620 |
+
"""
|
621 |
+
Alias: ArcFace, AAM-Softmax
|
622 |
+
|
623 |
+
ArcFace: Additive Angular Margin Loss for Deep Face Recognition
|
624 |
+
https://arxiv.org/abs/1801.07698
|
625 |
+
|
626 |
+
Reference code:
|
627 |
+
https://github.com/huangkeju/AAMSoftmax-OpenMax/blob/main/AAMSoftmax%2BOvA/metrics.py
|
628 |
+
|
629 |
+
"""
|
630 |
+
@staticmethod
|
631 |
+
def demo1():
|
632 |
+
"""
|
633 |
+
Conversion between degrees and radians:
|
634 |
+
pi / 180 corresponds to 1 degree,
|
635 |
+
pi / 180 = 0.01745
|
636 |
+
"""
|
637 |
+
|
638 |
+
# degrees to radians
|
639 |
+
degree = 10
|
640 |
+
result = degree * math.pi / 180
|
641 |
+
print(result)
|
642 |
+
|
643 |
+
# radians to degrees
|
644 |
+
radian = 0.2
|
645 |
+
result = radian / (math.pi / 180)
|
646 |
+
print(result)
|
647 |
+
|
648 |
+
return
|
649 |
+
|
650 |
+
@staticmethod
|
651 |
+
def demo2():
|
652 |
+
|
653 |
+
return
|
654 |
+
|
655 |
+
def __init__(self,
|
656 |
+
hidden_size: int,
|
657 |
+
num_labels: int,
|
658 |
+
margin: float = 0.2,
|
659 |
+
scale: float = 10.0,
|
660 |
+
):
|
661 |
+
"""
|
662 |
+
:param hidden_size:
|
663 |
+
:param num_labels:
|
664 |
+
:param margin: recommended angle range is [10, 30] degrees, i.e. [0.1745, 0.5236] in radians
|
665 |
+
:param scale:
|
666 |
+
"""
|
667 |
+
super(AdditiveAngularMarginLinear, self).__init__()
|
668 |
+
self.margin = margin
|
669 |
+
self.scale = scale
|
670 |
+
self.weight = torch.nn.Parameter(torch.FloatTensor(num_labels, hidden_size), requires_grad=True)
|
671 |
+
nn.init.xavier_uniform_(self.weight)
|
672 |
+
|
673 |
+
self.cos_margin = math.cos(self.margin)
|
674 |
+
self.sin_margin = math.sin(self.margin)
|
675 |
+
|
676 |
+
# sin(a-b) = sin(a)cos(b) - cos(a)sin(b)
|
677 |
+
# sin(pi - a) = sin(a)
|
678 |
+
|
679 |
+
def forward(self,
|
680 |
+
inputs: torch.Tensor,
|
681 |
+
targets: torch.LongTensor = None
|
682 |
+
):
|
683 |
+
"""
|
684 |
+
:param inputs: shape=[batch_size, ..., hidden_size]
|
685 |
+
:param targets:
|
686 |
+
:return: logits
|
687 |
+
"""
|
688 |
+
x = F.normalize(inputs)
|
689 |
+
weight = F.normalize(self.weight)
|
690 |
+
cosine = F.linear(x, weight)
|
691 |
+
|
692 |
+
if self.training and targets is not None:
|
693 |
+
# sin^2 + cos^2 = 1
|
694 |
+
sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))
|
695 |
+
|
696 |
+
# cos(a+b) = cos(a)cos(b) - sin(a)sin(b)
|
697 |
+
cosine_theta_margin = cosine * self.cos_margin - sine * self.sin_margin
|
698 |
+
|
699 |
+
# when the `cosine > - self.cos_margin` there is enough space to add margin on theta.
|
700 |
+
cosine_theta_margin = torch.where(cosine > - self.cos_margin, cosine_theta_margin, cosine - (self.margin * self.sin_margin))
|
701 |
+
|
702 |
+
one_hot = torch.zeros_like(cosine)
|
703 |
+
one_hot.scatter_(1, targets.view(-1, 1), 1)
|
704 |
+
|
705 |
+
logits = torch.where(one_hot == 1, cosine_theta_margin, cosine)
|
706 |
+
logits = logits * self.scale
|
707 |
+
else:
|
708 |
+
logits = cosine
|
709 |
+
return logits
|
710 |
+
|
711 |
+
|
712 |
+
def demo1():
|
713 |
+
HingeLoss.demo1()
|
714 |
+
return
|
715 |
+
|
716 |
+
|
717 |
+
def demo2():
|
718 |
+
AdditiveAngularMarginSoftMax.demo1()
|
719 |
+
|
720 |
+
inputs = torch.ones(size=(2, 5), dtype=torch.float32)
|
721 |
+
label: torch.LongTensor = torch.tensor(data=[0, 1], dtype=torch.long)
|
722 |
+
|
723 |
+
aam_softmax = AdditiveAngularMarginSoftMax(
|
724 |
+
hidden_size=5,
|
725 |
+
num_labels=2,
|
726 |
+
margin=1,
|
727 |
+
scale=1
|
728 |
+
)
|
729 |
+
|
730 |
+
outputs = aam_softmax.forward(inputs, label)
|
731 |
+
print(outputs)
|
732 |
+
|
733 |
+
return
|
734 |
+
|
735 |
+
|
736 |
+
if __name__ == '__main__':
|
737 |
+
# demo1()
|
738 |
+
demo2()
|
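Unlike AdditiveAngularMarginSoftMax, which returns a loss, AdditiveAngularMarginLinear only returns (margin-adjusted) logits, so an external criterion is still required. A minimal usage sketch, assuming the class defined above is in scope:

```python
import torch
import torch.nn as nn

head = AdditiveAngularMarginLinear(hidden_size=5, num_labels=2, margin=0.2, scale=10.0)
criterion = nn.CrossEntropyLoss()

features = torch.randn(4, 5)
targets = torch.tensor([0, 1, 1, 0])

head.train()
logits = head.forward(features, targets)   # cos(theta + m) on the target class, scaled by s
loss = criterion(logits, targets)
print(loss)

head.eval()
print(head.forward(features))              # inference: plain cosine similarities, no margin
```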