Spaces:
Sleeping
Sleeping
File size: 5,720 Bytes
77961b6 0c7d7d0 77961b6 3a0ee14 77961b6 3a0ee14 77961b6 3a0ee14 0c7d7d0 3a0ee14 77961b6 3a0ee14 77961b6 3a0ee14 77961b6 3a0ee14 77961b6 54410d4 3a0ee14 0c7d7d0 d65e913 0c7d7d0 77961b6 3a0ee14 77961b6 0c7d7d0 54410d4 77961b6 3a0ee14 0c7d7d0 d65e913 77961b6 54410d4 77961b6 3a0ee14 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import datasets
import logging
import json
import pandas as pd
def text_classificaiton_match_label_case_unsensative(id2label_mapping, label):
    """Look up ``label`` among the model labels, ignoring letter case.

    Args:
        id2label_mapping: Mapping whose keys are the model's label names.
        label: Dataset label to search for.

    Returns:
        A ``(model_label, label)`` tuple. ``model_label`` is the first key
        whose upper-cased form equals the upper-cased ``label``, or ``None``
        when no key matches. ``label`` is handed back unchanged.
    """
    # Lazy scan: stops at the first key that matches case-insensitively.
    case_insensitive_hits = (
        candidate
        for candidate in id2label_mapping
        if candidate.upper() == label.upper()
    )
    return next(case_insensitive_hits, None), label
def text_classification_map_model_and_dataset_labels(id2label, dataset_features):
    """Pair every model label with the dataset label that names the same class.

    Only ``datasets.ClassLabel`` features whose number of names equals the
    number of model labels are considered. A dataset label is matched to a
    model label by exact name first, then case-insensitively.

    Args:
        id2label: Model configuration mapping of class id -> label name.
        dataset_features: Mapping of dataset column name -> feature object.

    Returns:
        A ``(mapping, dataset_labels)`` tuple. ``mapping`` maps each model
        label to the matched dataset label (``None`` when unmatched).
        ``dataset_labels`` is the ``names`` list of the last ClassLabel
        feature that was examined, or ``None`` if none qualified.
    """
    # Every model label starts out unmatched.
    mapping = dict.fromkeys(id2label.values())
    matched_names = None

    class_label_features = (
        feat
        for feat in dataset_features.values()
        if isinstance(feat, datasets.ClassLabel)
    )
    for feat in class_label_features:
        names = feat.names
        if len(names) != len(mapping):
            # A different number of classes cannot be the target column.
            continue

        matched_names = names
        for dataset_label in names:
            if dataset_label in mapping:
                model_label = dataset_label
            else:
                # No exact hit: retry ignoring letter case.
                model_label, dataset_label = text_classificaiton_match_label_case_unsensative(
                    mapping, dataset_label
                )

            if model_label is None:
                print(f"Label {dataset_label} is not found in model labels")
            else:
                mapping[model_label] = dataset_label

    return mapping, matched_names
'''
params:
column_mapping: dict
example: {
"text": "sentences",
"label": {
"label0": "LABEL_0",
"label1": "LABEL_1"
}
}
ppl: pipeline
'''
def check_column_mapping_keys_validity(column_mapping, ppl):
    """Check a JSON column mapping's label pairs against the model's labels.

    Args:
        column_mapping: JSON text. When the decoded object has a ``"data"``
            entry, it is read as a sequence of pairs whose element 0 is the
            user label and element 1 is the model label.
        ppl: Object exposing ``ppl.model.config.id2label``.

    Returns:
        ``True`` when there is no ``"data"`` entry. Otherwise ``True`` only if
        the set of user labels, the set of model labels from the pairs, and
        the set of labels in ``id2label`` are all equal.
    """
    parsed = json.loads(column_mapping)
    if "data" not in parsed.keys():
        # Nothing label-related to validate.
        return True

    pairs = parsed["data"]
    from_user = {pair[0] for pair in pairs}
    from_pairs = {pair[1] for pair in pairs}
    from_model = set(ppl.model.config.id2label.values())

    return from_user == from_pairs and from_pairs == from_model
def infer_text_input_column(column_mapping, dataset_features):
    """Ensure ``column_mapping["text"]`` names a column of the dataset.

    If the mapping already points ``"text"`` at an existing dataset column,
    nothing is changed. Otherwise the first feature whose ``dtype`` is
    ``"string"`` is chosen and written into ``column_mapping`` in place.

    Args:
        column_mapping: Dict of model input name -> dataset column name.
            It is mutated when a text column is inferred.
        dataset_features: Mapping of dataset column name -> feature object.

    Returns:
        A ``(column_mapping, feature_map_df)`` tuple. ``feature_map_df`` is a
        one-row DataFrame describing the inferred pairing, or ``None`` when
        no inference was needed or no string column exists.
    """
    feature_map_df = None

    # Inference is needed unless the user named a column that really exists.
    needs_inference = True
    if "text" in column_mapping:
        dataset_text_column = column_mapping["text"]
        if dataset_text_column in dataset_features:
            needs_inference = False
        else:
            logging.warning(
                "Provided %s is not in Dataset columns", dataset_text_column
            )

    if not needs_inference:
        return column_mapping, feature_map_df

    # getattr: tolerate feature entries that expose no ``dtype`` attribute
    # instead of failing on them while scanning for a string column.
    candidates = [
        name
        for name, feature in dataset_features.items()
        if getattr(feature, "dtype", None) == "string"
    ]

    # Build the summary only when a candidate exists; indexing candidates[0]
    # unconditionally raised IndexError on datasets without string columns.
    if candidates:
        logging.debug("Candidates are %s", candidates)
        column_mapping["text"] = candidates[0]
        feature_map_df = pd.DataFrame({
            "Dataset Features": [candidates[0]],
            "Model Input Features": ["text"],
        })

    return column_mapping, feature_map_df
def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, split):
    """Complete a column mapping for a dataset/model pair and run one sample prediction.

    Args:
        column_mapping: Dict that may contain ``"text"`` (dataset text column)
            and ``"data"`` (a list of ``(user_label, model_label)`` pairs).
            It is mutated and returned.
        ppl: Callable invoked as ``ppl({"text": ...}, top_k=None)`` that also
            exposes ``ppl.model.config.id2label`` (presumably a transformers
            text-classification pipeline; confirm with the caller).
        d_id: Dataset identifier passed to ``datasets.load_dataset``.
        config: Dataset configuration name passed to ``datasets.load_dataset``.
        split: Key used to select the split from the loaded dataset.

    Returns:
        A 5-tuple ``(column_mapping, prediction_input, prediction_result,
        id2label_df, feature_map_df)``:

        * all five are ``None`` when the split has no ``features`` attribute;
        * ``(column_mapping, None, None, None, feature_map_df)`` when there is
          no ``"data"`` entry and at least one model label is unmatched;
        * ``(column_mapping, prediction_input, None, id2label_df,
          feature_map_df)`` when the sample prediction raises;
        * otherwise all five values are populated (``feature_map_df`` may
          still be ``None`` if no text column had to be inferred).
    """
    # We assume dataset is ok here
    ds = datasets.load_dataset(d_id, config)[split]
    try:
        dataset_features = ds.features
    except AttributeError:
        # Dataset does not have features, need to provide everything
        return None, None, None, None, None

    column_mapping, feature_map_df = infer_text_input_column(column_mapping, dataset_features)

    # Load dataset as DataFrame
    df = ds.to_pandas()

    # Retrieve all labels
    # NOTE(review): this empty dict is overwritten a few lines below before
    # it is ever read.
    id2label_mapping = {}
    id2label = ppl.model.config.id2label
    # Reverse lookup: model label name -> class id.
    label2id = {v: k for k, v in id2label.items()}

    # Infer labels
    id2label_mapping, dataset_labels = text_classification_map_model_and_dataset_labels(id2label, dataset_features)
    # Inverted view: dataset label -> model label. It is built BEFORE any
    # user-supplied pairs are applied, so it only reflects inferred matches.
    id2label_mapping_dataset_model = {
        v: k for k, v in id2label_mapping.items()
    }

    if "data" in column_mapping.keys():
        if isinstance(column_mapping["data"], list):
            # Use the column mapping passed by user
            for user_label, model_label in column_mapping["data"]:
                id2label_mapping[model_label] = user_label
    elif None in id2label_mapping.values():
        # No user mapping and some model label is unmatched: hand back an
        # empty label mapping keyed by class id for the caller to fill in.
        column_mapping["label"] = {
            i: None for i in id2label.keys()
        }
        return column_mapping, None, None, None, feature_map_df

    # NOTE(review): dataset_labels is None when no ClassLabel feature
    # qualified; with a "data" entry present, execution reaches this point
    # and iterating None would raise TypeError. Confirm whether callers rely
    # on that.
    id2label_df = pd.DataFrame({
        "Dataset Labels": dataset_labels,
        "Model Prediction Labels": [id2label_mapping_dataset_model[label] for label in dataset_labels],
    })

    # get a sample prediction from the model on the dataset
    prediction_input = None
    prediction_result = None
    try:
        # Use the first item to test prediction
        prediction_input = df.head(1).at[0, column_mapping["text"]]
        results = ppl({"text": prediction_input}, top_k=None)
        # This dict is replaced after the try block; building it here means a
        # label missing from label2id fails inside the try and is caught.
        prediction_result = {
            f'{result["label"]}({label2id[result["label"]]})': result["score"] for result in results
        }
    except Exception as e:
        # Pipeline prediction failed, need to provide labels
        print(e, '>>>> error')
        return column_mapping, prediction_input, None, id2label_df, feature_map_df

    # Final display form: "[id]label(original) - dataset_label(mapped)" -> score.
    prediction_result = {
        f'[{label2id[result["label"]]}]{result["label"]}(original) - {id2label_mapping[result["label"]]}(mapped)': result["score"] for result in results
    }

    if "data" not in column_mapping.keys():
        # Column mapping should contain original model labels
        # Class ids are paired with dataset labels positionally via zip.
        column_mapping["label"] = {
            str(i): id2label_mapping_dataset_model[label] for i, label in zip(id2label.keys(), dataset_labels)
        }

    return column_mapping, prediction_input, prediction_result, id2label_df, feature_map_df
|