Daniel Ferreira committed on
Commit
3665cbf
1 Parent(s): 5fea9ee

first commit

Browse files
Files changed (4) hide show
  1. .gitignore +2 -0
  2. README.md +39 -0
  3. evaluate.py +273 -0
  4. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ venv
2
+ __pycache__
README.md CHANGED
@@ -1,3 +1,42 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
  ---
4
+
5
+ This repo has an optimized version of [Detoxify](https://github.com/unitaryai/detoxify/), which needs less disk space and less memory at the cost of just a little bit of accuracy.
6
+
7
+ This is an experiment for me to learn how to use [🤗 Optimum](https://huggingface.co/docs/optimum/index).
8
+
9
+ # Usage
10
+
11
+ Loading the model requires the [🤗 Optimum](https://huggingface.co/docs/optimum/index) library to be installed.
12
+
13
+ ```python
14
+ from optimum.onnxruntime import ORTModelForSequenceClassification
15
+ from optimum.pipelines import pipeline as opt_pipeline
16
+ from transformers import AutoTokenizer
17
+
18
+
19
+ tokenizer = AutoTokenizer.from_pretrained("dcferreira/detoxify-optimized")
20
+ model = ORTModelForSequenceClassification.from_pretrained("dcferreira/detoxify-optimized")
21
+ pipe = opt_pipeline(
22
+ model=model,
23
+ task="text-classification",
24
+ function_to_apply="sigmoid",
25
+ accelerator="ort",
26
+ tokenizer=tokenizer,
27
+ return_all_scores=True, # return scores for all the labels, model was trained as multilabel
28
+ )
29
+
30
+ print(pipe(['example text','exemple de texte','texto de ejemplo','testo di esempio','texto de exemplo','örnek metin','пример текста']))
31
+ ```
32
+
33
+ # Performance
34
+
35
+ The table below compares some statistics for three setups: the original model, the original model run with [onnxruntime](https://onnxruntime.ai/), and the model optimized with onnxruntime.
36
+
37
+
38
+ | model | Accuracy | Samples p/ second (CPU) | Samples p/ second (GPU) | GPU VRAM | Disk Space |
39
+ |----------------|----------|-------------------------|-------------------------|----------|------------|
40
+ | original | 92.1083 | 16 | 250 | 3GB | 1.1GB |
41
+ | ort | 92.1067 | 19 | 340 | 4GB | 1.1GB |
42
+ | optimized (O4) | 92.1031 | 14 | 650 | 2GB | 540MB |
evaluate.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path
2
+ import time
3
+ from pathlib import Path
4
+ from typing import Callable, Optional, Tuple
5
+
6
+ import pandas as pd
7
+ from datasets import Dataset
8
+ from optimum.onnxruntime import (
9
+ ORTModelForSequenceClassification,
10
+ ORTOptimizer,
11
+ ORTQuantizer,
12
+ )
13
+ from optimum.onnxruntime.configuration import (
14
+ AutoCalibrationConfig,
15
+ AutoOptimizationConfig,
16
+ AutoQuantizationConfig,
17
+ )
18
+ from optimum.pipelines import pipeline as opt_pipeline
19
+ from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit
20
+ from sklearn.metrics import roc_auc_score
21
+ from transformers import AutoTokenizer, PreTrainedModel, PreTrainedTokenizer, pipeline
22
+ from transformers.pipelines.base import KeyDataset
23
+
24
+ from detoxify.detoxify import load_checkpoint
25
+
26
+
27
def get_gpu_utilization() -> int:
    """Return the amount of memory currently in use on GPU 0, in MB.

    NVML is initialized on every call, the memory info of device index 0 is
    queried, and the ``used`` byte count is converted with integer division.
    """
    nvmlInit()
    bytes_per_mb = 1024**2
    memory_info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0))
    return memory_info.used // bytes_per_mb
32
+
33
+
34
def load_data(base_path: Path, nrows: Optional[int] = None) -> pd.DataFrame:
    """Load the test set and attach its labels as a ``label`` column.

    Args:
        base_path: Directory containing ``test_labels.csv`` and ``test.csv``.
        nrows: Maximum number of rows to read from each file; ``None`` reads
            everything.

    Returns:
        The contents of ``test.csv`` (indexed by its first column) with an
        extra ``label`` column taken from ``test_labels.csv``.
    """
    # Both files are read with the same options so their indexes line up.
    read_options = {"index_col": 0, "nrows": nrows}
    labels = pd.read_csv(base_path / "test_labels.csv", **read_options)
    comments = pd.read_csv(base_path / "test.csv", **read_options)

    comments["label"] = labels
    return comments
43
+
44
+
45
def get_toxicity(result):
    """Return the score of the first ``"toxicity"`` entry in ``result``.

    ``result`` is an iterable of dicts carrying ``"label"`` and ``"score"``
    keys. An ``IndexError`` is raised when no entry is labelled "toxicity".
    """
    toxicity_entries = [entry for entry in result if entry["label"] == "toxicity"]
    return toxicity_entries[0]["score"]
47
+
48
+
49
def evaluate_devices(data_path: Path, evaluate_model_fn: Callable, **kwargs):
    """Run an evaluation function on CPU and on GPU and summarize the results.

    The CPU run uses only the first 1000 rows of the data; the GPU run
    (device ``"cuda:0"``) uses the full data set.

    Args:
        data_path: Directory passed to ``load_data``.
        evaluate_model_fn: Callable invoked as
            ``evaluate_model_fn(device, dataframe, **kwargs)``; its result is
            read through the ``"scores"``, ``"time_seconds"`` and
            ``"gpu_memory_mb"`` keys.
        **kwargs: Extra keyword arguments forwarded to ``evaluate_model_fn``.

    Returns:
        A dict with the GPU run's scores and GPU memory, plus samples per
        second for the CPU and GPU runs.
    """
    cpu_subset = load_data(data_path, nrows=1000)
    cpu_result = evaluate_model_fn("cpu", cpu_subset, **kwargs)

    full_set = load_data(data_path)
    gpu_result = evaluate_model_fn("cuda:0", full_set, **kwargs)

    summary = {"scores": gpu_result["scores"]}
    summary["samples_per_second_cpu"] = len(cpu_subset) / cpu_result["time_seconds"]
    summary["samples_per_second_gpu"] = len(full_set) / gpu_result["time_seconds"]
    summary["gpu_memory_mb"] = gpu_result["gpu_memory_mb"]
    return summary
62
+
63
+
64
def evaluate_pipeline(pipe, df):
    """Score a text-classification pipeline on ``df`` and time the predictions.

    Args:
        pipe: Pipeline called on the ``content`` column of ``df``.
        df: DataFrame with ``content``, ``label`` and ``lang`` columns.

    Returns:
        A dict with ``"scores"`` (ROC AUC overall under ``"all"`` and per
        language code), ``"time_seconds"`` and ``"gpu_memory_mb"``.
    """
    predictions = pipe(
        KeyDataset(Dataset.from_pandas(df), "content"),
        top_k=None,
        batch_size=4,
        padding="longest",
        truncation=True,
    )
    # NOTE(review): the timer starts after the pipe(...) call, so it only
    # captures inference if the pipeline returns a lazy iterator that is
    # consumed while the Series below is built -- confirm.
    started = time.time()
    toxicity_pred = pd.Series(map(get_toxicity, predictions), index=df.index)
    finished = time.time()

    scores = {"all": roc_auc_score(df.label, toxicity_pred)}
    for lang in ("it", "fr", "ru", "pt", "es", "tr"):
        in_language = df.lang == lang
        scores[lang] = roc_auc_score(df[in_language].label, toxicity_pred[in_language])

    return {
        "scores": scores,
        "time_seconds": finished - started,
        "gpu_memory_mb": get_gpu_utilization(),
    }
89
+
90
+
91
def load_original_model(device: str) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    """Load the multilingual Detoxify checkpoint and attach label mappings.

    Args:
        device: Device string forwarded to ``load_checkpoint``.

    Returns:
        The ``(model, tokenizer)`` pair; ``model.config.id2label`` and
        ``model.config.label2id`` are populated from the checkpoint's class
        names followed by the identity classes listed below.
    """
    model, tokenizer, class_names = load_checkpoint(
        model_type="multilingual", device=device
    )

    # Extra label names appended after the checkpoint's own class names.
    identity_classes = [
        "male",
        "female",
        "homosexual_gay_or_lesbian",
        "christian",
        "jewish",
        "muslim",
        "black",
        "white",
        "psychiatric_or_mental_illness",
    ]
    all_labels = class_names + identity_classes
    model.config.id2label = dict(enumerate(all_labels))
    model.config.label2id = {label: index for index, label in enumerate(all_labels)}

    return model, tokenizer
110
+
111
+
112
def evaluate_original_model(device: str, test_df: pd.DataFrame):
    """Evaluate the original model on ``test_df`` using ``device``.

    Builds a text-classification pipeline with a sigmoid applied to the
    outputs and hands it to ``evaluate_pipeline``.
    """
    model, tokenizer = load_original_model(device)
    pipeline_options = dict(
        model=model,
        task="text-classification",
        tokenizer=tokenizer,
        function_to_apply="sigmoid",
        device=device,
    )
    return evaluate_pipeline(pipeline(**pipeline_options), test_df)
124
+
125
+
126
def save_original_model(base_path: Path = Path(".")):
    """Save the original model, wrapped in a pipeline, under ``base_path``.

    The model is loaded on CPU and saved via the pipeline's
    ``save_pretrained``.
    """
    model, tokenizer = load_original_model("cpu")
    pipeline_options = dict(
        model=model,
        task="text-classification",
        tokenizer=tokenizer,
        function_to_apply="sigmoid",
    )
    pipeline(**pipeline_options).save_pretrained(base_path)
135
+
136
+
137
def evaluate_ort_model(device: str, test_df: pd.DataFrame, base_path: Path = Path(".")):
    """Evaluate the model at ``base_path`` through an ONNX Runtime pipeline.

    The model is loaded with ``export=True`` and run through an Optimum
    pipeline using the ``"ort"`` accelerator on ``device``.
    """
    ort_model = ORTModelForSequenceClassification.from_pretrained(base_path, export=True)
    tokenizer = AutoTokenizer.from_pretrained(base_path, device=device)

    pipeline_options = dict(
        model=ort_model,
        task="text-classification",
        tokenizer=tokenizer,
        function_to_apply="sigmoid",
        device=device,
        accelerator="ort",
    )
    return evaluate_pipeline(opt_pipeline(**pipeline_options), test_df)
151
+
152
+
153
def evaluate_ort_optimize_model(
    device: str, test_df: pd.DataFrame, base_path: Path = Path(".")
):
    """Evaluate the ONNX Runtime-optimized model, creating it when missing.

    If ``model_optimized.onnx`` does not exist under ``base_path``, the model
    is loaded with ``export=True`` and optimized with the ``O4`` configuration,
    saving into ``base_path``. The optimized file is then loaded and evaluated
    through an Optimum pipeline on ``device``.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_path, device=device)

    needs_optimization = not os.path.exists(base_path / "model_optimized.onnx")
    if needs_optimization:
        exported_model = ORTModelForSequenceClassification.from_pretrained(
            base_path, export=True
        )
        # Alternative configuration left by the author:
        # AutoOptimizationConfig.O1(fp16=True)
        optimization_config = AutoOptimizationConfig.O4()
        ORTOptimizer.from_pretrained(exported_model).optimize(
            save_dir=base_path,
            optimization_config=optimization_config,
        )

    optimized_model = ORTModelForSequenceClassification.from_pretrained(
        base_path, file_name="model_optimized.onnx"
    )
    pipeline_options = dict(
        model=optimized_model,
        task="text-classification",
        function_to_apply="sigmoid",
        device=device,
        accelerator="ort",
        tokenizer=tokenizer,
    )
    return evaluate_pipeline(opt_pipeline(**pipeline_options), test_df)
183
+
184
+
185
def evaluate_ort_quantize_model(
    device: str,
    test_df: pd.DataFrame,
    base_path: Path = Path("."),
    overwrite: bool = False,
):
    """Evaluate the statically quantized ONNX model, creating it when needed.

    Args:
        device: Device the evaluation pipeline runs on.
        test_df: Evaluation data; its ``content`` column is also used to
            calibrate the quantizer when quantization runs.
        base_path: Directory holding the model files; ``model_quantized.onnx``
            is written to and read from here.
        overwrite: When true, quantize again even if ``model_quantized.onnx``
            already exists.

    Returns:
        The result of ``evaluate_pipeline`` for the quantized model.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_path, device=device)

    if overwrite or not os.path.exists(base_path / "model_quantized.onnx"):
        model = ORTModelForSequenceClassification.from_pretrained(
            base_path, export=True
        )
        qconfig = AutoQuantizationConfig.avx2(is_static=True, per_channel=False)
        quantizer = ORTQuantizer.from_pretrained(model)

        def preprocess_fn(ex):
            return tokenizer(ex["content"])

        # Static quantization needs activation ranges: calibrate on the
        # tokenized evaluation data, keeping only the model inputs.
        calibration_dataset = (
            Dataset.from_pandas(test_df)
            .map(preprocess_fn)
            .select_columns(["input_ids", "attention_mask"])
        )
        calibration_config = AutoCalibrationConfig.minmax(calibration_dataset)
        ranges = quantizer.fit(
            dataset=calibration_dataset,
            calibration_config=calibration_config,
            operators_to_quantize=qconfig.operators_to_quantize,
        )

        quantizer.quantize(
            save_dir=base_path,
            quantization_config=qconfig,
            calibration_tensors_range=ranges,
        )

    # The leftover debugging argument ``foo="bar"`` is no longer forwarded to
    # ``from_pretrained``; the other loaders in this file pass only the path
    # and file name.
    model = ORTModelForSequenceClassification.from_pretrained(
        base_path,
        file_name="model_quantized.onnx",
    )
    pipe = opt_pipeline(
        model=model,
        task="text-classification",
        function_to_apply="sigmoid",
        device=device,
        accelerator="ort",
        tokenizer=tokenizer,
    )

    return evaluate_pipeline(pipe, test_df)
237
+
238
+
239
if __name__ == "__main__":
    # Command-line entry point: evaluate one model variant on CPU and GPU
    # and print the resulting summary.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "data_path",
        type=str,
        help="Path to jigsaw multilingual toxic comment data. "
        'For example: "jigsaw_data/jigsaw-multilingual-toxic-comment-classification"',
    )
    parser.add_argument(
        "--models_path",
        type=str,
        default=".",
        help="Path to model weights directory (root of the repo)",
    )
    parser.add_argument(
        "model", type=str, help="Model to evaluate (original, ort, optimized, quantized)."
    )

    args = parser.parse_args()

    data = Path(args.data_path)
    models_p = Path(args.models_path)

    # Map each model name to its evaluation function and the extra keyword
    # arguments it receives; the original model takes no base path.
    evaluators = {
        "original": (evaluate_original_model, {}),
        "ort": (evaluate_ort_model, {"base_path": models_p}),
        "optimized": (evaluate_ort_optimize_model, {"base_path": models_p}),
        "quantized": (evaluate_ort_quantize_model, {"base_path": models_p}),
    }
    if args.model not in evaluators:
        raise ValueError(f"Invalid model received: {args.model!r}")

    evaluate_fn, extra_kwargs = evaluators[args.model]
    print(evaluate_devices(data, evaluate_fn, **extra_kwargs))
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pandas==2.0.1
2
+ optimum[onnxruntime-gpu]==1.8.4
3
+ nvidia-ml-py3==7.352.0
4
+ scikit-learn==1.2.2
5
+ transformers==4.29.1
6
+ datasets==2.12.0