InfosysResponsibleAiToolKit committed on
Commit
a5eaebe
·
1 Parent(s): cc271f7
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +5 -0
  2. Kubernetes/responsible-ai-moderation-model.yaml +62 -0
  3. LICENSE.txt +9 -0
  4. README.md +166 -0
  5. execreation.bat +12 -0
  6. lib/privacy-2.0.8-py3-none-any.whl +3 -0
  7. models/bancode/config.json +34 -0
  8. models/bancode/special_tokens_map.json +7 -0
  9. models/bancode/tokenizer.json +0 -0
  10. models/bancode/tokenizer_config.json +15 -0
  11. models/bancode/vocab.txt +0 -0
  12. models/dbertaInjection/added_tokens.json +3 -0
  13. models/dbertaInjection/config.json +43 -0
  14. models/dbertaInjection/gitattributes.txt +34 -0
  15. models/dbertaInjection/gitignore.txt +1 -0
  16. models/dbertaInjection/special_tokens_map.json +9 -0
  17. models/dbertaInjection/spm.model +3 -0
  18. models/dbertaInjection/tokenizer.json +0 -0
  19. models/dbertaInjection/tokenizer_config.json +16 -0
  20. models/dbertaInjection/training_args.bin +3 -0
  21. models/detoxify/config.json +21 -0
  22. models/detoxify/merges.txt +0 -0
  23. models/detoxify/tokenizer.json +0 -0
  24. models/detoxify/vocab.json +0 -0
  25. models/gibberish/config.json +40 -0
  26. models/gibberish/special_tokens_map.json +1 -0
  27. models/gibberish/tokenizer.json +0 -0
  28. models/gibberish/tokenizer_config.json +1 -0
  29. models/gibberish/vocab.txt +0 -0
  30. models/multi-qa-mpnet-base-dot-v1/1_Pooling/config.json +7 -0
  31. models/multi-qa-mpnet-base-dot-v1/README.md +184 -0
  32. models/multi-qa-mpnet-base-dot-v1/config.json +24 -0
  33. models/multi-qa-mpnet-base-dot-v1/config_sentence_transformers.json +7 -0
  34. models/multi-qa-mpnet-base-dot-v1/modules.json +14 -0
  35. models/multi-qa-mpnet-base-dot-v1/sentence_bert_config.json +4 -0
  36. models/multi-qa-mpnet-base-dot-v1/special_tokens_map.json +15 -0
  37. models/multi-qa-mpnet-base-dot-v1/tokenizer.json +0 -0
  38. models/multi-qa-mpnet-base-dot-v1/tokenizer_config.json +16 -0
  39. models/multi-qa-mpnet-base-dot-v1/vocab.txt +0 -0
  40. models/nli-MiniLM2-L6-H768/config.json +36 -0
  41. models/nli-MiniLM2-L6-H768/merges.txt +0 -0
  42. models/nli-MiniLM2-L6-H768/special_tokens_map.json +1 -0
  43. models/nli-MiniLM2-L6-H768/tokenizer.json +0 -0
  44. models/nli-MiniLM2-L6-H768/tokenizer_config.json +1 -0
  45. models/nli-MiniLM2-L6-H768/vocab.json +0 -0
  46. models/restricted-dberta-base-zeroshot-v2/added_tokens.json +3 -0
  47. models/restricted-dberta-base-zeroshot-v2/config.json +43 -0
  48. models/restricted-dberta-base-zeroshot-v2/special_tokens_map.json +15 -0
  49. models/restricted-dberta-base-zeroshot-v2/spm.model +3 -0
  50. models/restricted-dberta-base-zeroshot-v2/tokenizer.json +0 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ *.pyc
2
+ *39.pyc
3
+ myenv/
4
+ build/
5
+ dist/
Kubernetes/responsible-ai-moderation-model.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ apiVersion: v1
2
+ kind: Service
3
+ metadata:
4
+ name: responsible-ai-moderation-model
5
+ namespace: irai-toolkit-test
6
+ labels:
7
+ app: responsible-ai-moderation-model
8
+ spec:
9
+ type: ClusterIP
10
+ ports:
11
+ - port: 8000
12
+ selector:
13
+ app: responsible-ai-moderation-model
14
+ ---
15
+ apiVersion: apps/v1
16
+ kind: Deployment
17
+ seccompProfile:
18
+ type: "RuntimeDefault"
19
+ automountServiceAccountToken: false
20
+ metadata:
21
+ name: responsible-ai-moderation-model
22
+ namespace: irai-toolkit-test
23
+ labels:
24
+ app: responsible-ai-moderation-model
25
+ version: v1
26
+ spec:
27
+ replicas: 1
28
+ selector:
29
+ matchLabels:
30
+ app: responsible-ai-moderation-model
31
+ version: v1
32
+ template:
33
+ metadata:
34
+ labels:
35
+ app: responsible-ai-moderation-model
36
+ version: v1
37
+ spec:
38
+ automountServiceAccountToken: false # Disable token mounting
39
+ imagePullSecrets:
40
+ - name: docker-secret
41
+ containers:
42
+ - envFrom:
43
+ - configMapRef:
44
+ name: privacy-config
45
+ image: <Image Name>
46
+ imagePullPolicy: Always
47
+ name: responsible-ai-privacy
48
+ ports:
49
+ - containerPort: 8000
50
+ securityContext:
51
+ runAsUser: 1000 # Non-root user
52
+ runAsGroup: 1000
53
+ capabilities:
54
+ drop:
55
+ - ALL # Drop all capabilities
56
+ seccompProfile:
57
+ type: RuntimeDefault
58
+ resources:
59
+ limits:
60
+ #cpu: '2'
61
+ memory: '8Gi'
62
+ nvidia.com/gpu: 1
LICENSE.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ The MIT License (MIT)
2
+
3
+ Copyright 2024-2025 Infosys Ltd.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README.md ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Responsible-AI-Moderation Model
2
+
3
+ ## Table of Contents
4
+
5
+ - [Introduction](#introduction)
6
+ - [Features](#features)
7
+ - [Installation](#installation)
8
+ - [Set Configuration Variables](#set-configuration-variables)
9
+ - [Models Required](#models-required)
10
+ - [Running the Application](#running-the-application)
11
+ - [Docker Image](#docker-image)
12
+ - [License](#license)
13
+ - [Contact](#contact)
14
+
15
+ ## Introduction
16
+ The **Moderation Model** module acts as a central hub for machine learning models for prompt injection, toxicity, jailbreak, restricted topic, custom theme and refusal checks. It provides the endpoints to utilize the response generated by these models.
17
+
18
+ ## Features
19
+ The **Moderation Model** module acts as a wrapper for the traditional AI models we are using for various checks like prompt injection, jailbreak, toxicity etc.
20
+
21
+ ## Installation
22
+ To run the application, first we need to install Python and the necessary packages:
23
+
24
+ 1. Install Python (version 3.11.x) from the [official website](https://www.python.org/downloads/) and ensure it is added to your system PATH.
25
+
26
+ 2. Clone the repository : responsible-ai-mm-flask:
27
+ ```sh
28
+ git clone <repository-url>
29
+ ```
30
+
31
+ 3. Navigate to the `responsible-ai-mm-flask` directory:
32
+ ```sh
33
+ cd responsible-ai-mm-flask
34
+ ```
35
+
36
+ 4. Create a virtual environment:
37
+ ```sh
38
+ python -m venv venv
39
+ ```
40
+
41
+ 5. Activate the virtual environment:
42
+ - On Windows:
43
+ ```sh
44
+ .\venv\Scripts\activate
45
+ ```
46
+
47
+ 6. Go to the `requirements` directory where the `requirement.txt` file is present.
48
+ In the `requirement.txt` file, comment out the following line:
49
+ ```sh
50
+ lib/torch-2.2.0+cu118-cp39-cp39-linux_x86_64.whl
51
+ ```
52
+ **Note:** Download the torch wheel that matches your installed Python version and operating system [e.g., if the Python version is 3.10, use torch-2.2.0+cu118-**cp310**-**cp310**-**linux**_x86_64.whl, where cp310 denotes Python version 3.10 and linux denotes the OS, which can be linux/win; this is **_not applicable for Mac_**]
53
+
54
+ **Note:** If working on Windows, comment out the line above (it is for Linux), and replace
55
+ ```sh
56
+ lib/
57
+ ```
58
+ with
59
+ ```sh
60
+ ../lib/
61
+ ```
62
+ **Note:** If working on macOS, run the command below after installing the packages from requirement.txt
63
+ ```sh
64
+ pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
65
+ ```
66
+
67
+ Download and place the en_core_web_lg-3.5.0-py3-none-any.whl inside the lib folder.
68
+ [en_core_web_lg](https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl) and install the requirements:
69
+
70
+ ```sh
71
+ pip install -r requirement.txt
72
+ ```
73
+
74
+ **Note:** If you get an error related to "cuda-python" while installing from requirement.txt, comment out cuda-python in the
75
+ requirement.txt file and run pip install again
76
+ Install the fastapi library as well, use the following command:
77
+ ```sh
78
+ pip install fastapi
79
+ ```
80
+ ## Set Configuration Variables
81
+ After installing all the required packages, configure the variables necessary to run the APIs.
82
+
83
+ 1. Navigate to the `src` directory:
84
+ ```sh
85
+ cd ..
86
+ ```
87
+
88
+ 2. Locate the `.env` file, which contains keys like the following:
89
+
90
+ ```sh
91
+ workers=1
92
+ WORKERS="${workers}"
93
+ # DB_NAME="${dbname}"
94
+ # DB_USERNAME="${username}"
95
+ # DB_PWD="${password}"
96
+ # DB_IP="${ipaddress}"
97
+ # DB_PORT="${port}"
98
+ # MONGO_PATH="mongodb://${DB_USERNAME}:${DB_PWD}@${DB_IP}:${DB_PORT}/"
99
+ # MONGO_PATH= "mongodb://localhost:27017/"
100
+ ```
101
+
102
+ 3. Replace the placeholders with your actual values.
103
+
104
+ ## Models Required
105
+ The following models are required to run the application. Download all the model files from the links provided, and place them in folders with the names given below.
106
+
107
+ 1. [Prompt Injection](https://huggingface.co/deepset/deberta-v3-base-injection/tree/main)
108
+ Files required to download here are : model.safetensors, config.json, tokenizer_config.json, tokenizer.json, special_tokens_map.json.
109
+ Name the folder as 'dbertaInjection'.
110
+
111
+ 2. [Restricted Topic](https://huggingface.co/MoritzLaurer/deberta-v3-base-zeroshot-v2.0/tree/main)
112
+ Files required to download here are : model.safetensors, added_tokens.json, config.json, special_tokens_map.json, spm.model, tokenizer.json, tokenizer_config.json.
113
+ Name the folder as 'restricted-dberta-base-zeroshot-v2'.
114
+
115
+ 3. [Sentence Transformer Model](https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1/tree/main)
116
+ Files required to download here are : 1_Pooling folder, pytorch_model.bin, vocab.txt, tokenizer.json, tokenizer_config.json, special_tokens_map.json, sentence_bert_config.json, modules.json, config.json, config_sentence_transformers.json.
117
+ Name the folder as 'multi-qa-mpnet-base-dot-v1'.
118
+
119
+ 4. [Detoxify](https://huggingface.co/FacebookAI/roberta-base/tree/main)
120
+ Files required to download here are : vocab.json, tokenizer.json, merges.txt, config.json.
121
+ Now download the model checkpoint file from this url and keep it under this folder -
122
+ [toxic_model_ckpt_file](https://github.com/unitaryai/detoxify/releases/download/v0.3-alpha/toxic_debiased-c7548aa0.ckpt)
123
+ Name the folder as 'detoxify'.
124
+
125
+ 5. [Gibberish](https://huggingface.co/madhurjindal/autonlp-Gibberish-Detector-492513457)
126
+ Files required to download here are : vocab.txt, tokenizer.json, config.json, pytorch_model.bin, tokenizer_config.json, special_tokens_map.json.
127
+ Name the folder as 'gibberish'.
128
+
129
+ 6. [Bancode](https://huggingface.co/vishnun/codenlbert-tiny)
130
+ Files required to download here are : vocab.txt, tokenizer.json,config.json,pytorch_model.bin, tokenizer_config.json,special_tokens_map.json.
131
+ Name the folder as 'bancode'.
132
+
133
+ 7. [Restricted Topic](https://huggingface.co/cross-encoder/nli-MiniLM2-L6-H768)
134
+ Files required to download here are : vocab.json, tokenizer.json,config.json,merges.txt,pytorch_model.bin, tokenizer_config.json,special_tokens_map.json.
135
+ Name the folder as 'nli-MiniLM2-L6-H768'.
136
+
137
+ Place the above folders in a folder named 'models' in the following way: 'responsible-ai-mm-flask/models'.
138
+
139
+ ## Running the Application
140
+ Once we have completed all the aforementioned steps, we can start the service.
141
+
142
+ 1. Navigate to the `src` directory:
143
+
144
+ 2. Run `main_MM.py` file:
145
+ ```sh
146
+ python main_MM.py
147
+ ```
148
+
149
+ 3. PORT_NO : Use the Port No that is configured in .env file.
150
+
151
+ Open the following URL in your browser:
152
+ `http://localhost:8000/rai/v1/raimoderationmodels/docs`
153
+
154
+ **Note:**
155
+ 1. To address the issue where the Passport Number is not recognized in Privacy, modify the "piiEntitiesToBeRedacted" field in the privacy() under service.py file (line no: 98) from None to an empty list []. This adjustment ensures that the Passport Number is correctly identified.
156
+
157
+ 2. Do not use this Moderation Model repository as a standalone repository. It serves as the base or dependency for the Moderation Layer repository, which provides the 'Guardrail' functionality, so access this repository APIs through Moderation layer.
158
+
159
+ ## Docker Image
160
+ The Docker image for the ModerationModel module has been published on Docker Hub. You can access it here: [ModerationModel image](https://hub.docker.com/repository/docker/infosysresponsibleaitoolkit/responsible-ai-moderationmodel)
161
+
162
+ ## License
163
+ The source code for the project is licensed under the MIT license, which you can find in the [LICENSE.txt](LICENSE.txt) file.
164
+
165
+ ## Contact
166
+ If you have more questions or need further insights, please feel free to contact us at Infosysraitoolkit@infosys.com
execreation.bat ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @echo off
2
+ echo started exe creation!! Please wait....It might take an hour depending on computation speed
3
+ set /p site_packages="Enter the path to the site-packages directory: "
4
+ cd %~dp0
5
+ echo %~dp0
6
+ powershell -Command "(Get-Content src\main.py) | ForEach-Object { if ($_ -match 'from routing\.safety_router import img_router|app\.register_blueprint\(img_router,url_prefix=''\/rai\/v1\/raimoderationmodels''\)') { '#'+ $_ } else { $_ } } | Set-Content src\main.py"
7
+ pyinstaller --add-data "src\logger.ini;." --add-data "models;models" --add-data "src\static;src" --add-data "%site_packages%;." --add-data "data;." --hidden-import=transformers src\main.py
8
+ if ERRORLEVEL 1 (
9
+ echo PyInstaller encountered an error. Check the output above for details.
10
+ ) else (
11
+ echo Exe creation was successful!
12
+ )
lib/privacy-2.0.8-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c22356831d15244ed329527f3b2a0683e134c90796b74bf20a0151fcdb3705d
3
+ size 129207
models/bancode/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "prajjwal1/bert-tiny",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 128,
11
+ "id2label": {
12
+ "0": "CODE",
13
+ "1": "NL"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 512,
17
+ "label2id": {
18
+ "CODE": 0,
19
+ "NL": 1
20
+ },
21
+ "layer_norm_eps": 1e-12,
22
+ "max_position_embeddings": 512,
23
+ "model_type": "bert",
24
+ "num_attention_heads": 2,
25
+ "num_hidden_layers": 2,
26
+ "pad_token_id": 0,
27
+ "position_embedding_type": "absolute",
28
+ "problem_type": "single_label_classification",
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.30.2",
31
+ "type_vocab_size": 2,
32
+ "use_cache": true,
33
+ "vocab_size": 30522
34
+ }
models/bancode/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
models/bancode/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/bancode/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "cls_token": "[CLS]",
4
+ "do_basic_tokenize": true,
5
+ "do_lower_case": true,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 1000000000000000019884624838656,
8
+ "never_split": null,
9
+ "pad_token": "[PAD]",
10
+ "sep_token": "[SEP]",
11
+ "strip_accents": null,
12
+ "tokenize_chinese_chars": true,
13
+ "tokenizer_class": "BertTokenizer",
14
+ "unk_token": "[UNK]"
15
+ }
models/bancode/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/dbertaInjection/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
models/dbertaInjection/config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/deberta-v3-base",
3
+ "architectures": [
4
+ "DebertaV2ForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "id2label": {
11
+ "0": "LEGIT",
12
+ "1": "INJECTION"
13
+ },
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "label2id": {
17
+ "INJECTION": 1,
18
+ "LEGIT": 0
19
+ },
20
+ "layer_norm_eps": 1e-07,
21
+ "max_position_embeddings": 512,
22
+ "max_relative_positions": -1,
23
+ "model_type": "deberta-v2",
24
+ "norm_rel_ebd": "layer_norm",
25
+ "num_attention_heads": 12,
26
+ "num_hidden_layers": 12,
27
+ "pad_token_id": 0,
28
+ "pooler_dropout": 0,
29
+ "pooler_hidden_act": "gelu",
30
+ "pooler_hidden_size": 768,
31
+ "pos_att_type": [
32
+ "p2c",
33
+ "c2p"
34
+ ],
35
+ "position_biased_input": false,
36
+ "position_buckets": 256,
37
+ "relative_attention": true,
38
+ "share_att_key": true,
39
+ "torch_dtype": "float32",
40
+ "transformers_version": "4.29.1",
41
+ "type_vocab_size": 0,
42
+ "vocab_size": 128100
43
+ }
models/dbertaInjection/gitattributes.txt ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/dbertaInjection/gitignore.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ checkpoint-*/
models/dbertaInjection/special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": "[UNK]"
9
+ }
models/dbertaInjection/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
models/dbertaInjection/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/dbertaInjection/tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "clean_up_tokenization_spaces": true,
4
+ "cls_token": "[CLS]",
5
+ "do_lower_case": false,
6
+ "eos_token": "[SEP]",
7
+ "mask_token": "[MASK]",
8
+ "model_max_length": 1000000000000000019884624838656,
9
+ "pad_token": "[PAD]",
10
+ "sep_token": "[SEP]",
11
+ "sp_model_kwargs": {},
12
+ "split_by_punct": false,
13
+ "tokenizer_class": "DebertaV2Tokenizer",
14
+ "unk_token": "[UNK]",
15
+ "vocab_type": "spm"
16
+ }
models/dbertaInjection/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a69a93fd6f6ff40d061d2890cc2c653840203973675b926ca88eacda8a741022
3
+ size 3963
models/detoxify/config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-05,
14
+ "max_position_embeddings": 514,
15
+ "model_type": "roberta",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "pad_token_id": 1,
19
+ "type_vocab_size": 1,
20
+ "vocab_size": 50265
21
+ }
models/detoxify/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/detoxify/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/detoxify/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
models/gibberish/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "AutoNLP",
3
+ "_num_labels": 4,
4
+ "activation": "gelu",
5
+ "architectures": [
6
+ "DistilBertForSequenceClassification"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "dim": 768,
10
+ "dropout": 0.1,
11
+ "hidden_dim": 3072,
12
+ "id2label": {
13
+ "0": "clean",
14
+ "1": "mild gibberish",
15
+ "2": "noise",
16
+ "3": "word salad"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "label2id": {
20
+ "clean": 0,
21
+ "mild gibberish": 1,
22
+ "noise": 2,
23
+ "word salad": 3
24
+ },
25
+ "max_length": 64,
26
+ "max_position_embeddings": 512,
27
+ "model_type": "distilbert",
28
+ "n_heads": 12,
29
+ "n_layers": 6,
30
+ "pad_token_id": 0,
31
+ "padding": "max_length",
32
+ "problem_type": "single_label_classification",
33
+ "qa_dropout": 0.1,
34
+ "seq_classif_dropout": 0.2,
35
+ "sinusoidal_pos_embds": false,
36
+ "tie_weights_": true,
37
+ "torch_dtype": "float32",
38
+ "transformers_version": "4.15.0",
39
+ "vocab_size": 30522
40
+ }
models/gibberish/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
models/gibberish/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/gibberish/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "AutoNLP", "tokenizer_class": "DistilBertTokenizer"}
models/gibberish/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/multi-qa-mpnet-base-dot-v1/1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": true,
4
+ "pooling_mode_mean_tokens": false,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false
7
+ }
models/multi-qa-mpnet-base-dot-v1/README.md ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: sentence-similarity
3
+ tags:
4
+ - sentence-transformers
5
+ - feature-extraction
6
+ - sentence-similarity
7
+ datasets:
8
+ - flax-sentence-embeddings/stackexchange_xml
9
+ - ms_marco
10
+ - gooaq
11
+ - yahoo_answers_topics
12
+ - search_qa
13
+ - eli5
14
+ - natural_questions
15
+ - trivia_qa
16
+ - embedding-data/QQP
17
+ - embedding-data/PAQ_pairs
18
+ - embedding-data/Amazon-QA
19
+ - embedding-data/WikiAnswers
20
+
21
+ ---
22
+
23
+ # multi-qa-mpnet-base-dot-v1
24
+ This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for **semantic search**. It has been trained on 215M (question, answer) pairs from diverse sources. For an introduction to semantic search, have a look at: [SBERT.net - Semantic Search](https://www.sbert.net/examples/applications/semantic-search/README.html)
25
+
26
+
27
+ ## Usage (Sentence-Transformers)
28
+ Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
29
+
30
+ ```
31
+ pip install -U sentence-transformers
32
+ ```
33
+
34
+ Then you can use the model like this:
35
+ ```python
36
+ from sentence_transformers import SentenceTransformer, util
37
+
38
+ query = "How many people live in London?"
39
+ docs = ["Around 9 Million people live in London", "London is known for its financial district"]
40
+
41
+ #Load the model
42
+ model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')
43
+
44
+ #Encode query and documents
45
+ query_emb = model.encode(query)
46
+ doc_emb = model.encode(docs)
47
+
48
+ #Compute dot score between query and all document embeddings
49
+ scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
50
+
51
+ #Combine docs & scores
52
+ doc_score_pairs = list(zip(docs, scores))
53
+
54
+ #Sort by decreasing score
55
+ doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
56
+
57
+ #Output passages & scores
58
+ for doc, score in doc_score_pairs:
59
+ print(score, doc)
60
+ ```
61
+
62
+
63
+ ## Usage (HuggingFace Transformers)
64
+ Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the correct pooling-operation on-top of the contextualized word embeddings.
65
+
66
+ ```python
67
+ from transformers import AutoTokenizer, AutoModel
68
+ import torch
69
+
70
+ #CLS Pooling - Take output from first token
71
+ def cls_pooling(model_output):
72
+ return model_output.last_hidden_state[:,0]
73
+
74
+ #Encode text
75
+ def encode(texts):
76
+ # Tokenize sentences
77
+ encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
78
+
79
+ # Compute token embeddings
80
+ with torch.no_grad():
81
+ model_output = model(**encoded_input, return_dict=True)
82
+
83
+ # Perform pooling
84
+ embeddings = cls_pooling(model_output)
85
+
86
+ return embeddings
87
+
88
+
89
+ # Sentences we want sentence embeddings for
90
+ query = "How many people live in London?"
91
+ docs = ["Around 9 Million people live in London", "London is known for its financial district"]
92
+
93
+ # Load model from HuggingFace Hub
94
+ tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
95
+ model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
96
+
97
+ #Encode query and docs
98
+ query_emb = encode(query)
99
+ doc_emb = encode(docs)
100
+
101
+ #Compute dot score between query and all document embeddings
102
+ scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()
103
+
104
+ #Combine docs & scores
105
+ doc_score_pairs = list(zip(docs, scores))
106
+
107
+ #Sort by decreasing score
108
+ doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
109
+
110
+ #Output passages & scores
111
+ for doc, score in doc_score_pairs:
112
+ print(score, doc)
113
+ ```
114
+
115
+ ## Technical Details
116
+
117
+ The following are some technical details on how this model must be used:
118
+
119
+ | Setting | Value |
120
+ | --- | :---: |
121
+ | Dimensions | 768 |
122
+ | Produces normalized embeddings | No |
123
+ | Pooling-Method | CLS pooling |
124
+ | Suitable score functions | dot-product (e.g. `util.dot_score`) |
125
+
126
+ ----
127
+
128
+
129
+ ## Background
130
+
131
+ The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised
132
+ contrastive learning objective. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
133
+
134
+ We developed this model during the
135
+ [Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
136
+ organized by Hugging Face. We developed this model as part of the project:
137
+ [Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as help from Google's Flax, JAX, and Cloud team members on efficient deep learning frameworks.
138
+
139
+ ## Intended uses
140
+
141
+ Our model is intended to be used for semantic search: It encodes queries / questions and text paragraphs in a dense vector space. It finds relevant documents for the given passages.
142
+
143
+ Note that there is a limit of 512 word pieces: Text longer than that will be truncated. Further note that the model was just trained on input text up to 250 word pieces. It might not work well for longer text.
144
+
145
+
146
+
147
+ ## Training procedure
148
+
149
+ The full training script is accessible in this current repository: `train_script.py`.
150
+
151
+ ### Pre-training
152
+
153
+ We use the pretrained [`mpnet-base`](https://huggingface.co/microsoft/mpnet-base) model. Please refer to the model card for more detailed information about the pre-training procedure.
154
+
155
+ #### Training
156
+
157
+ We use the concatenation from multiple datasets to fine-tune our model. In total we have about 215M (question, answer) pairs.
158
+ We sampled each dataset given a weighted probability which configuration is detailed in the `data_config.json` file.
159
+
160
+ The model was trained with [MultipleNegativesRankingLoss](https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss) using CLS-pooling, dot-product as similarity function, and a scale of 1.
161
+
162
+
163
+
164
+
165
+ | Dataset | Number of training tuples |
166
+ |--------------------------------------------------------|:--------------------------:|
167
+ | [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs from WikiAnswers | 77,427,422 |
168
+ | [PAQ](https://github.com/facebookresearch/PAQ) Automatically generated (Question, Paragraph) pairs for each paragraph in Wikipedia | 64,371,441 |
169
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs from all StackExchanges | 25,316,456 |
170
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs from all StackExchanges | 21,396,559 |
171
+ | [MS MARCO](https://microsoft.github.io/msmarco/) Triplets (query, answer, hard_negative) for 500k queries from Bing search engine | 17,579,773 |
172
+ | [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) (query, answer) pairs for 3M Google queries and Google featured snippet | 3,012,496 |
173
+ | [Amazon-QA](http://jmcauley.ucsd.edu/data/amazon/qa/) (Question, Answer) pairs from Amazon product pages | 2,448,839 |
174
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) pairs from Yahoo Answers | 1,198,260 |
175
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) pairs from Yahoo Answers | 681,164 |
176
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) pairs from Yahoo Answers | 659,896 |
177
+ | [SearchQA](https://huggingface.co/datasets/search_qa) (Question, Answer) pairs for 140k questions, each with Top5 Google snippets on that question | 582,261 |
178
+ | [ELI5](https://huggingface.co/datasets/eli5) (Question, Answer) pairs from Reddit ELI5 (explainlikeimfive) | 325,475 |
179
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions pairs (titles) | 304,525 |
180
+ | [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Question, Duplicate_Question, Hard_Negative) triplets for Quora Questions Pairs dataset | 103,663 |
181
+ | [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) (Question, Paragraph) pairs for 100k real Google queries with relevant Wikipedia paragraph | 100,231 |
182
+ | [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) (Question, Paragraph) pairs from SQuAD2.0 dataset | 87,599 |
183
+ | [TriviaQA](https://huggingface.co/datasets/trivia_qa) (Question, Evidence) pairs | 73,346 |
184
+ | **Total** | **214,988,242** |
models/multi-qa-mpnet-base-dot-v1/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "sentence-transformers/multi-qa-mpnet-base-dot-v1",
3
+ "architectures": [
4
+ "MPNetModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "mpnet",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "relative_attention_num_buckets": 32,
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.22.1",
23
+ "vocab_size": 30527
24
+ }
models/multi-qa-mpnet-base-dot-v1/config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "2.0.0",
4
+ "transformers": "4.6.1",
5
+ "pytorch": "1.8.1"
6
+ }
7
+ }
models/multi-qa-mpnet-base-dot-v1/modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
models/multi-qa-mpnet-base-dot-v1/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
models/multi-qa-mpnet-base-dot-v1/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "[UNK]"
15
+ }
models/multi-qa-mpnet-base-dot-v1/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/multi-qa-mpnet-base-dot-v1/tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "do_lower_case": true,
5
+ "eos_token": "</s>",
6
+ "mask_token": "<mask>",
7
+ "model_max_length": 512,
8
+ "name_or_path": "sentence-transformers/multi-qa-mpnet-base-dot-v1",
9
+ "pad_token": "<pad>",
10
+ "sep_token": "</s>",
11
+ "special_tokens_map_file": null,
12
+ "strip_accents": null,
13
+ "tokenize_chinese_chars": true,
14
+ "tokenizer_class": "MPNetTokenizer",
15
+ "unk_token": "[UNK]"
16
+ }
models/multi-qa-mpnet-base-dot-v1/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/nli-MiniLM2-L6-H768/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "nreimers/MiniLMv2-L6-H768-distilled-from-RoBERTa-Large",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "contradiction",
15
+ "1": "entailment",
16
+ "2": "neutral"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "label2id": {
21
+ "contradiction": 0,
22
+ "entailment": 1,
23
+ "neutral": 2
24
+ },
25
+ "layer_norm_eps": 1e-05,
26
+ "max_position_embeddings": 514,
27
+ "model_type": "roberta",
28
+ "num_attention_heads": 12,
29
+ "num_hidden_layers": 6,
30
+ "pad_token_id": 1,
31
+ "position_embedding_type": "absolute",
32
+ "transformers_version": "4.6.1",
33
+ "type_vocab_size": 1,
34
+ "use_cache": true,
35
+ "vocab_size": 50265
36
+ }
models/nli-MiniLM2-L6-H768/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/nli-MiniLM2-L6-H768/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
models/nli-MiniLM2-L6-H768/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/nli-MiniLM2-L6-H768/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "nreimers/MiniLMv2-L6-H768-distilled-from-RoBERTa-Large"}
models/nli-MiniLM2-L6-H768/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
models/restricted-dberta-base-zeroshot-v2/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
models/restricted-dberta-base-zeroshot-v2/config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/deberta-v3-base",
3
+ "architectures": [
4
+ "DebertaV2ForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "id2label": {
11
+ "0": "entailment",
12
+ "1": "not_entailment"
13
+ },
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "label2id": {
17
+ "entailment": 0,
18
+ "not_entailment": 1
19
+ },
20
+ "layer_norm_eps": 1e-07,
21
+ "max_position_embeddings": 512,
22
+ "max_relative_positions": -1,
23
+ "model_type": "deberta-v2",
24
+ "norm_rel_ebd": "layer_norm",
25
+ "num_attention_heads": 12,
26
+ "num_hidden_layers": 12,
27
+ "pad_token_id": 0,
28
+ "pooler_dropout": 0,
29
+ "pooler_hidden_act": "gelu",
30
+ "pooler_hidden_size": 768,
31
+ "pos_att_type": [
32
+ "p2c",
33
+ "c2p"
34
+ ],
35
+ "position_biased_input": false,
36
+ "position_buckets": 256,
37
+ "relative_attention": true,
38
+ "share_att_key": true,
39
+ "torch_dtype": "float16",
40
+ "transformers_version": "4.37.2",
41
+ "type_vocab_size": 0,
42
+ "vocab_size": 128100
43
+ }
models/restricted-dberta-base-zeroshot-v2/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }
models/restricted-dberta-base-zeroshot-v2/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
models/restricted-dberta-base-zeroshot-v2/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff