File size: 13,524 Bytes
0c7d7d0
3573a39
 
 
9e4233f
3573a39
9e4233f
35be7f4
 
8e32a09
77961b6
7f86019
db8ac73
3573a39
45c5476
 
7f86019
 
 
 
45c5476
35be7f4
9e4233f
 
fdffa47
 
 
5b8d6d5
 
fdffa47
2694247
 
5b8d6d5
fdffa47
5b8d6d5
 
2694247
 
5b8d6d5
02f1357
fdffa47
9e4233f
3573a39
35be7f4
3573a39
fdffa47
9e4233f
35be7f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3573a39
35be7f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f86019
2694247
7f86019
 
2694247
7f86019
2694247
 
 
 
 
 
7f86019
 
 
 
 
 
 
 
35be7f4
 
9e4233f
 
 
 
 
 
 
 
 
 
 
3573a39
77961b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58c39e0
77961b6
 
 
 
 
 
 
3573a39
 
 
77961b6
 
3a0ee14
 
77961b6
 
 
3573a39
 
3a0ee14
 
 
 
 
 
 
 
 
 
3573a39
 
 
0c7d7d0
 
 
 
 
 
 
 
 
 
3573a39
0c7d7d0
 
3573a39
 
9e4233f
 
 
 
 
 
 
3573a39
 
 
3a0ee14
77961b6
 
3a0ee14
9e4233f
77961b6
 
 
 
 
 
 
 
 
3573a39
 
 
 
 
 
77961b6
 
 
3573a39
9e4233f
3a0ee14
3573a39
 
9e4233f
 
 
 
 
 
 
 
 
3573a39
 
 
 
 
 
9e4233f
0c7d7d0
 
d65e913
0c7d7d0
 
77961b6
3573a39
9e4233f
3573a39
9e4233f
 
 
3573a39
 
9e4233f
77961b6
3573a39
 
 
 
 
 
 
 
9e4233f
 
 
3573a39
9e4233f
 
0607989
9e4233f
 
 
 
 
 
 
 
 
 
7f86019
 
 
 
 
 
3573a39
0607989
9e4233f
 
 
 
 
0607989
9e4233f
 
7f86019
9e4233f
 
0607989
35be7f4
 
7f86019
 
 
 
 
 
 
 
35be7f4
 
 
 
9e4233f
 
 
45c5476
7f86019
 
9e4233f
 
 
 
 
 
3a0ee14
 
 
 
 
 
 
 
9e4233f
3a0ee14
9e4233f
3a0ee14
9e4233f
3573a39
9e4233f
3a0ee14
3573a39
 
 
 
3a0ee14
9e4233f
0c7d7d0
3573a39
9e4233f
 
 
 
77961b6
3573a39
 
 
9e4233f
 
3573a39
9e4233f
 
 
 
 
3573a39
 
 
 
 
 
9e4233f
 
 
3573a39
9e4233f
3573a39
 
 
9e4233f
 
 
3573a39
 
 
 
 
 
 
5f9a95f
 
0607989
 
 
 
 
 
db8ac73
 
 
 
0607989
7055d8b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
import json
import logging

import datasets
import huggingface_hub
import pandas as pd
from transformers import pipeline
import requests
import os
from app_env import HF_WRITE_TOKEN

# Module-level logger named after the module's import name.
logger = logging.getLogger(__name__)
# Hugging Face endpoint queried by check_hf_token_validity to verify a token.
AUTH_CHECK_URL = "https://huggingface.co/api/whoami-v2"

# NOTE(review): this rebinds ``logger``; the binding above is therefore dead and
# the effective logger is named after the file path — confirm which is intended.
logger = logging.getLogger(__file__)

class HuggingFaceInferenceAPIResponse:
    """Carrier for a message describing the outcome of an Inference API call.

    In this module it is returned by ``get_example_prediction`` in place of a
    prediction when the API answers with an error payload.
    """

    def __init__(self, message):
        # Stored verbatim on the instance as ``message``.
        self.message = message


def get_labels_and_features_from_dataset(ds):
    """Split a dataset's columns into label names, feature columns and label columns.

    Columns whose name starts with ``"label"`` are treated as label columns;
    every other column is a feature column. Only the first label column is
    inspected to obtain the label names.

    Returns:
        ``(labels, features, label_keys)``. When no label column exists, the
        full column list is returned twice and ``label_keys`` is ``None`` so
        the caller can post-process. On any failure ``(None, None, None)`` is
        returned after logging a warning.
    """
    try:
        schema = ds.features
        column_names = list(schema.keys())
        label_keys = [name for name in column_names if name.startswith("label")]
        features = [name for name in column_names if not name.startswith("label")]

        if not label_keys:
            # No label column: hand back everything for post processing.
            return list(column_names), list(column_names), None

        first_label_key = label_keys[0]
        if isinstance(schema[first_label_key], datasets.ClassLabel):
            labels = schema[first_label_key].names
        elif hasattr(schema[first_label_key], "feature"):
            # Container feature: take the names of the wrapped inner feature.
            labels = schema[first_label_key].feature.names
        else:
            # Plain value column: fall back to the distinct values in the data.
            labels = ds.unique(first_label_key)
        return labels, features, label_keys
    except Exception as e:
        logging.warning(
            f"Get Labels/Features Failed for dataset: {e}"
        )
        return None, None, None

def check_model_task(model_id):
    """Return the pipeline tag the Hub reports for ``model_id``.

    Returns ``None`` when the model has no pipeline tag or when the lookup
    raises for any reason.
    """
    try:
        # ``pipeline_tag`` may itself be None; it is returned unchanged.
        return huggingface_hub.model_info(model_id).pipeline_tag
    except Exception:
        return None

def get_model_labels(model_id, example_input):
    """Query the Inference API with ``example_input`` and collect predicted labels.

    The token is read from the environment variable whose name is held in
    ``HF_WRITE_TOKEN`` (empty string when unset).

    Returns:
        Every value stored under a ``"label"`` key in the response, or
        ``None`` when the response contains ``"error"``.
    """
    token = os.environ.get(HF_WRITE_TOKEN, default="")
    response = hf_inference_api(
        model_id,
        token,
        {"inputs": example_input, "options": {"use_cache": True}},
    )
    return None if "error" in response else extract_from_response(response, "label")

def extract_from_response(data, key):
    """Recursively gather every non-``None`` value stored under ``key``.

    Dicts and lists are walked depth-first in iteration order; any other type
    contributes nothing. A value found under ``key`` is both recorded and
    searched in turn.
    """
    if isinstance(data, dict):
        found = []
        direct_hit = data.get(key)
        if direct_hit is not None:
            found.append(direct_hit)
        for child in data.values():
            found += extract_from_response(child, key)
        return found

    if isinstance(data, list):
        return [hit for item in data for hit in extract_from_response(item, key)]

    return []

def hf_inference_api(model_id, hf_token, payload):
    """POST ``payload`` to the Inference API for ``model_id`` and return the JSON.

    The base URL comes from the ``HF_INFERENCE_ENDPOINT`` environment variable
    and defaults to the public Inference API. If the first answer reports
    "Input is too long", the request is retried once with truncation enabled.

    Args:
        model_id: Model repository id appended to ``/models/``.
        hf_token: Token sent as a ``Bearer`` authorization header.
        payload: JSON-serialisable request body. It is never modified; the
            retry uses a copy.

    Returns:
        The decoded JSON body of the last response, or
        ``{"error": <raw response content>}`` when decoding fails.
    """
    hf_inference_api_endpoint = os.environ.get(
        "HF_INFERENCE_ENDPOINT", default="https://api-inference.huggingface.co"
    )
    url = f"{hf_inference_api_endpoint}/models/{model_id}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    response = requests.post(url, headers=headers, json=payload)

    if not hasattr(response, "status_code") or response.status_code != 200:
        logger.warning(f"Request to inference API returns {response}")

    try:
        output = response.json()
        if "error" in output and "Input is too long" in output["error"]:
            # Retry with truncation on a copy so the caller's dict stays intact.
            retry_payload = {
                **payload,
                "parameters": {"truncation": True, "max_length": 512},
            }
            response = requests.post(url, headers=headers, json=retry_payload)
            if not hasattr(response, "status_code") or response.status_code != 200:
                logger.warning(f"Request to inference API returns {response}")
            return response.json()
        return output
    except Exception:
        # Body was not JSON (or had an unexpected shape): surface the raw content.
        return {"error": response.content}
    
def preload_hf_inference_api(model_id):
    """Send a fixed test sentence to the Inference API for ``model_id``.

    The response is discarded. The token is read from the environment
    variable whose name is held in ``HF_WRITE_TOKEN`` (empty string when unset).
    """
    token = os.environ.get(HF_WRITE_TOKEN, default="")
    warmup_payload = {"inputs": "This is a test", "options": {"use_cache": True}}
    hf_inference_api(model_id, token, warmup_payload)

def check_model_pipeline(model_id):
    """Build a ``transformers`` pipeline for ``model_id`` using its Hub pipeline tag.

    Returns ``None`` if either the Hub lookup or the pipeline construction
    raises.
    """
    try:
        hub_task = huggingface_hub.model_info(model_id).pipeline_tag
        return pipeline(task=hub_task, model=model_id)
    except Exception:
        return None


def text_classificaiton_match_label_case_unsensative(id2label_mapping, label):
    """Find the model label equal to ``label`` when both are upper-cased.

    Returns:
        ``(model_label, label)`` for the first matching key of
        ``id2label_mapping``, or ``(None, label)`` when nothing matches.
    """
    matched = next(
        (
            candidate
            for candidate in id2label_mapping.keys()
            if candidate.upper() == label.upper()
        ),
        None,
    )
    return matched, label


def text_classification_map_model_and_dataset_labels(id2label, dataset_features):
    """Pair each model label with a dataset label of the same name.

    Only ``ClassLabel`` features with as many names as the model has labels
    are considered. Names are matched exactly first, then case-insensitively.

    Returns:
        ``(id2label_mapping, dataset_labels)`` where ``id2label_mapping`` maps
        each model label to the matched dataset label (``None`` if unmatched)
        and ``dataset_labels`` holds the names of the last feature considered
        (``None`` if no feature qualified).
    """
    id2label_mapping = dict.fromkeys(id2label.values())
    dataset_labels = None

    for feature in dataset_features.values():
        is_class_label = isinstance(feature, datasets.ClassLabel)
        if not is_class_label or len(feature.names) != len(id2label_mapping):
            continue

        dataset_labels = feature.names
        for label in feature.names:
            if label in id2label_mapping:
                matched_model_label = label
            else:
                # No exact hit: fall back to a case-insensitive comparison.
                (
                    matched_model_label,
                    label,
                ) = text_classificaiton_match_label_case_unsensative(
                    id2label_mapping, label
                )

            if matched_model_label is None:
                print(f"Label {label} is not found in model labels")
            else:
                id2label_mapping[matched_model_label] = label

    return id2label_mapping, dataset_labels


"""
params:
    column_mapping: dict
    example: {
        "text": "sentences",
        "label": {
            "label0": "LABEL_0",
            "label1": "LABEL_1"
        }
    }
    ppl: pipeline
"""


def check_column_mapping_keys_validity(column_mapping, ppl):
    """Check the label pairs of a JSON column mapping against the model labels.

    Args:
        column_mapping: JSON string. When it has a ``"data"`` entry, that entry
            is read as a sequence of ``(user_label, model_label)`` pairs.
        ppl: Pipeline whose ``model.config.id2label`` lists the model labels.

    Returns:
        ``True`` when there is no ``"data"`` entry, otherwise whether the set
        of user labels, the set of mapped model labels and the set of the
        model's own labels are all equal.
    """
    parsed = json.loads(column_mapping)
    if "data" not in parsed:
        return True

    pairs = parsed["data"]
    user_labels = {pair[0] for pair in pairs}
    model_labels = {pair[1] for pair in pairs}
    original_labels = set(ppl.model.config.id2label.values())

    return user_labels == model_labels and model_labels == original_labels


"""
params:
    column_mapping: dict
    dataset_features: dict
    example: {
        'text': Value(dtype='string', id=None), 
        'label': ClassLabel(names=['negative', 'neutral', 'positive'], id=None)
    }
"""


def infer_text_input_column(column_mapping, dataset_features):
    """Make sure ``column_mapping["text"]`` names a column of the dataset.

    If the mapping already names an existing column, nothing is inferred.
    Otherwise the first feature whose ``dtype`` is ``"string"`` is chosen and
    written into ``column_mapping`` in place.

    Args:
        column_mapping: Dict that may hold a ``"text"`` entry; updated in place
            when a column is inferred.
        dataset_features: Mapping of column name to feature object.

    Returns:
        ``(column_mapping, feature_map_df)``. ``feature_map_df`` is a one-row
        DataFrame describing the inferred column, or ``None`` when nothing was
        inferred (the provided column was valid, or no string column exists).
    """
    if "text" in column_mapping:
        dataset_text_column = column_mapping["text"]
        if dataset_text_column in dataset_features:
            # The user-provided column exists: nothing to infer.
            return column_mapping, None
        logging.warning(f"Provided {dataset_text_column} is not in Dataset columns")

    # Features without a ``dtype`` attribute are skipped rather than raising.
    candidates = [
        name
        for name in dataset_features
        if getattr(dataset_features[name], "dtype", None) == "string"
    ]
    if not candidates:
        # Previously this indexed candidates[0] and raised IndexError.
        return column_mapping, None

    logging.debug(f"Candidates are {candidates}")
    column_mapping["text"] = candidates[0]
    feature_map_df = pd.DataFrame(
        {"Dataset Features": [candidates[0]], "Model Input Features": ["text"]}
    )
    return column_mapping, feature_map_df


"""
params:
    column_mapping: dict
    id2label_mapping: dict
    example:
    id2label_mapping: {
        'negative': 'negative', 
        'neutral': 'neutral', 
        'positive': 'positive'
        }
"""


def infer_output_label_column(
    column_mapping, id2label_mapping, id2label, dataset_labels
):
    """Fill in the label part of ``column_mapping`` and summarise it as a table.

    Args:
        column_mapping: Dict updated in place. A list under ``"data"`` is read
            as ``(user_label, model_label)`` pairs supplied by the user.
        id2label_mapping: Dict from model label to dataset label; updated in
            place with user-supplied pairs.
        id2label: The model's id-to-label mapping.
        dataset_labels: Label names taken from the dataset.

    Returns:
        ``(column_mapping, id2label_df)``. ``id2label_df`` is ``None`` when no
        user mapping was given and some model label is still unmatched.
    """
    if "data" in column_mapping:
        user_pairs = column_mapping["data"]
        if isinstance(user_pairs, list):
            # The user-supplied pairs take precedence over inferred matches.
            for user_label, model_label in user_pairs:
                id2label_mapping[model_label] = user_label
    else:
        if None in id2label_mapping.values():
            # Inference was incomplete: expose empty slots and stop here.
            column_mapping["label"] = dict.fromkeys(id2label.keys())
            return column_mapping, None

        # Column mapping should contain original model labels
        column_mapping["label"] = {
            str(model_id): id2label_mapping[ds_label]
            for model_id, ds_label in zip(id2label.keys(), dataset_labels)
        }

    mapped_labels = [id2label_mapping[ds_label] for ds_label in dataset_labels]
    id2label_df = pd.DataFrame(
        {
            "Dataset Labels": dataset_labels,
            "Model Prediction Labels": mapped_labels,
        }
    )
    return column_mapping, id2label_df


def check_dataset_features_validity(d_id, config, split):
    """Load a dataset split and return it as a DataFrame with its features.

    The dataset itself is assumed to be loadable; it is loaded with
    ``trust_remote_code=True``.

    Returns:
        ``(df, dataset_features)``, or ``(None, None)`` when the loaded object
        has no ``features`` attribute.
    """
    loaded = datasets.load_dataset(d_id, config, split=split, trust_remote_code=True)
    try:
        dataset_features = loaded.features
    except AttributeError:
        # Dataset does not have features, need to provide everything
        return None, None

    return loaded.to_pandas(), dataset_features

def select_the_first_string_column(ds):
    """Return the first feature whose value in row 0 is a ``str``, else ``None``."""
    return next(
        (name for name in ds.features.keys() if isinstance(ds[0][name], str)),
        None,
    )


def get_example_prediction(model_id, dataset_id, dataset_config, dataset_split, hf_token):
    """Run the first dataset row through the Inference API as a sample prediction.

    The input is the ``"text"`` column when present, otherwise the first
    column whose first-row value is a string.

    Returns:
        ``(prediction_input, prediction_result)`` where ``prediction_result`` is
        a ``{label: score}`` dict on success, a
        ``HuggingFaceInferenceAPIResponse`` when the API reports an error, or
        ``None`` when anything raises.
    """
    prediction_input = None
    try:
        # Use the first item to test prediction
        ds = datasets.load_dataset(
            dataset_id, dataset_config, split=dataset_split, trust_remote_code=True
        )
        if "text" in ds.features.keys():
            prediction_input = ds[0]["text"]
        else:
            # Dataset does not have text column
            prediction_input = ds[0][select_the_first_string_column(ds)]

        results = hf_inference_api(
            model_id,
            hf_token,
            {"inputs": prediction_input, "options": {"use_cache": True}},
        )

        if isinstance(results, dict) and "error" in results.keys():
            if "estimated_time" in results.keys():
                message = f"Estimated time: {int(results['estimated_time'])}s. Please try again later."
            else:
                message = f"Inference Error: {results['error']}."
            return prediction_input, HuggingFaceInferenceAPIResponse(message)

        # Unwrap nested lists until reaching the list of per-label dicts.
        while isinstance(results, list) and not isinstance(results[0], dict):
            results = results[0]

        prediction_result = {
            f'{entry["label"]}': entry["score"] for entry in results
        }
    except Exception as e:
        # inference api prediction failed, show the error message
        logger.error(f"Get example prediction failed {e}")
        return prediction_input, None

    return prediction_input, prediction_result


def get_sample_prediction(ppl, df, column_mapping, id2label_mapping):
    """Run the pipeline on the first DataFrame row and label the scores.

    Args:
        ppl: Callable invoked as ``ppl({"text": ...}, top_k=None)``; expected to
            return entries with ``"label"`` and ``"score"`` keys.
        df: DataFrame whose row with index ``0`` supplies the input.
        column_mapping: Dict whose ``"text"`` entry names the input column.
        id2label_mapping: Dict from model label to mapped label.

    Returns:
        ``(prediction_input, prediction_result)``. ``prediction_result`` maps
        ``"<label>(original) - <mapped>(mapped)"`` to the score, or is ``None``
        when reading the input or running the pipeline fails.
    """
    prediction_input = None
    try:
        # Use the first item to test prediction
        first_row = df.head(1)
        prediction_input = first_row.at[0, column_mapping["text"]]
        results = ppl({"text": prediction_input}, top_k=None)
        # Built only to confirm every entry has "label" and "score"; a
        # malformed entry raises here and is reported as a failed prediction.
        _ = {f'{entry["label"]}': entry["score"] for entry in results}
    except Exception:
        # Pipeline prediction failed, need to provide labels
        return prediction_input, None

    # Display results in original label and mapped label
    prediction_result = {}
    for entry in results:
        display_key = (
            f'{entry["label"]}(original) - {id2label_mapping[entry["label"]]}(mapped)'
        )
        prediction_result[display_key] = entry["score"]
    return prediction_input, prediction_result


def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, split):
    """Complete a text-classification column mapping for a dataset/model pair.

    Loads the dataset, infers the text input column and the label mapping,
    then runs one sample prediction through ``ppl``.

    Args:
        column_mapping: Dict with optional ``"text"`` / ``"data"`` entries;
            completed in place by the inference helpers.
        ppl: Pipeline whose ``model.config.id2label`` lists the model labels.
        d_id: Dataset id passed to the loader.
        config: Dataset configuration name.
        split: Dataset split name.

    Returns:
        ``(column_mapping, prediction_input, prediction_result, id2label_df,
        feature_map_df)``. Entries that could not be determined are ``None``;
        all five are ``None`` when the dataset exposes no features or no text
        column mapping table was produced.
    """
    # load dataset as pd DataFrame
    # get features column from dataset
    df, dataset_features = check_dataset_features_validity(d_id, config, split)
    if dataset_features is None:
        # The loader reports (None, None) for datasets without features;
        # passing None on would fail on ``dataset_features.keys()``.
        return None, None, None, None, None

    column_mapping, feature_map_df = infer_text_input_column(
        column_mapping, dataset_features
    )
    if feature_map_df is None:
        # NOTE(review): this is also reached when the caller supplied a valid
        # "text" column (nothing inferred) — confirm bailing out is intended.
        return None, None, None, None, None

    # Retrieve all labels
    id2label = ppl.model.config.id2label

    # Infer labels
    id2label_mapping, dataset_labels = text_classification_map_model_and_dataset_labels(
        id2label, dataset_features
    )
    column_mapping, id2label_df = infer_output_label_column(
        column_mapping, id2label_mapping, id2label, dataset_labels
    )
    if id2label_df is None:
        # Unable to infer the output label column.
        return column_mapping, None, None, None, feature_map_df

    # Get a sample prediction; ``prediction_result`` is None when it fails,
    # which is passed through unchanged in the returned tuple.
    prediction_input, prediction_result = get_sample_prediction(
        ppl, df, column_mapping, id2label_mapping
    )

    return (
        column_mapping,
        prediction_input,
        prediction_result,
        id2label_df,
        feature_map_df,
    )


def check_hf_token_validity(hf_token):
    """Return whether ``hf_token`` is accepted by the Hugging Face whoami endpoint.

    Empty strings and non-string values are rejected without any request.
    Otherwise the token is valid exactly when the endpoint answers with
    HTTP 200.
    """
    if hf_token == "" or not isinstance(hf_token, str):
        return False

    # use huggingface api to check the token
    response = requests.get(
        AUTH_CHECK_URL, headers={"Authorization": f"Bearer {hf_token}"}
    )
    return response.status_code == 200