pii-NER-20240509-023506 / model_code.json
rungalileo's picture
Upload model_code.json with huggingface_hub
8dec806 verified
raw
history blame contribute delete
No virus
7.37 kB
{"MAX_N_TOKENS": 256, "LABEL_ENCODING": "BIO", "LABEL2ID": {"O": 0, "B-CREDIT_CARD_NUMBER_LAST_4": 1, "I-CREDIT_CARD_NUMBER_LAST_4": 2, "B-VEHICLE_VRM": 3, "I-VEHICLE_VRM": 4, "B-API_KEY": 5, "I-API_KEY": 6, "B-CREDIT_CARD_CVV": 7, "B-URL": 8, "I-URL": 9, "B-PHONE_NUMBER": 10, "I-PHONE_NUMBER": 11, "B-BIC": 12, "I-BIC": 13, "B-USERNAME": 14, "I-USERNAME": 15, "B-IBAN": 16, "I-IBAN": 17, "B-DATE_OF_BIRTH": 18, "I-DATE_OF_BIRTH": 19, "B-NAME": 20, "I-NAME": 21, "B-EMAIL": 22, "I-EMAIL": 23, "B-PREFIX": 24, "B-CREDIT_CARD_NUMBER": 25, "I-CREDIT_CARD_NUMBER": 26, "B-OTHER_ID": 27, "I-OTHER_ID": 28, "B-SSN_LAST_4": 29, "I-SSN_LAST_4": 30, "B-IP_V4": 31, "I-IP_V4": 32, "B-SSN": 33, "I-SSN": 34, "B-VEHICLE_VIN": 35, "I-VEHICLE_VIN": 36, "B-CREDIT_CARD_EXP_DATE": 37, "I-CREDIT_CARD_EXP_DATE": 38, "B-IP_V6": 39, "I-IP_V6": 40, "B-DRIVER_LICENSE": 41, "I-DRIVER_LICENSE": 42, "I-CREDIT_CARD_CVV": 43, "B-PASSWORD": 44, "I-PASSWORD": 45, "B-PIN": 46, "I-PIN": 47, "B-DATE": 48, "I-DATE": 49, "B-STATE": 50, "B-PHONE_IMEI": 51, "I-PHONE_IMEI": 52, "B-STREET_NAME_AND_NUMBER": 53, "I-STREET_NAME_AND_NUMBER": 54, "B-MAC": 55, "I-MAC": 56, "B-ACCOUNT_NUMBER": 57, "I-ACCOUNT_NUMBER": 58, "B-ZIP_CODE": 59, "I-ZIP_CODE": 60, "B-COUNTRY": 61, "I-COUNTRY": 62, "I-STATE": 63, "B-CITY": 64, "I-CITY": 65, "B-COMPANY_NAME": 66, "I-COMPANY_NAME": 67, "I-PREFIX": 68}, "LABELS_TO_PROD": {"O": "O", "B-NAME": "name", "B-SSN": "ssn", "B-SSN_LAST_4": "ssn", "B-DATE_OF_BIRTH": "date_of_birth", "B-DATE": "date", "B-EMAIL": "email", "B-PHONE_NUMBER": "phone_number", "B-USERNAME": "username", "B-STREET_NAME_AND_NUMBER": "address", "B-CITY": "address", "B-STATE": "address", "B-ZIP_CODE": "address", "B-CREDIT_CARD_CVV": "credit_card_info", "B-CREDIT_CARD_NUMBER": "credit_card_info", "B-CREDIT_CARD_EXP_DATE": "credit_card_info", "B-CREDIT_CARD_NUMBER_LAST_4": "credit_card_info", "B-IP_V4": "network_info", "B-IP_V6": "network_info", "B-MAC": "network_info", "B-PASSWORD": "password", "B-API_KEY": "password", "B-ACCOUNT_NUMBER": "account_info", "B-BIC": "account_info", "B-IBAN": "account_info", "B-DRIVER_LICENSE": "personal_id", "B-PHONE_IMEI": "personal_id", "B-VEHICLE_VIN": "personal_id", "B-VEHICLE_VRM": "personal_id", "B-OTHER_ID": "personal_id", "B-PIN": "personal_id", "B-URL": "url", "B-COMPANY_NAME": "O", "B-COUNTRY": "O", "B-PREFIX": "O", "I-NAME": "name", "I-SSN": "ssn", "I-SSN_LAST_4": "ssn", "I-DATE_OF_BIRTH": "date_of_birth", "I-DATE": "date", "I-EMAIL": "email", "I-PHONE_NUMBER": "phone_number", "I-USERNAME": "username", "I-STREET_NAME_AND_NUMBER": "address", "I-CITY": "address", "I-STATE": "address", "I-ZIP_CODE": "address", "I-CREDIT_CARD_CVV": "credit_card_info", "I-CREDIT_CARD_NUMBER": "credit_card_info", "I-CREDIT_CARD_EXP_DATE": "credit_card_info", "I-CREDIT_CARD_NUMBER_LAST_4": "credit_card_info", "I-IP_V4": "network_info", "I-IP_V6": "network_info", "I-MAC": "network_info", "I-PASSWORD": "password", "I-API_KEY": "password", "I-ACCOUNT_NUMBER": "account_info", "I-BIC": "account_info", "I-IBAN": "account_info", "I-DRIVER_LICENSE": "personal_id", "I-PHONE_IMEI": "personal_id", "I-VEHICLE_VIN": "personal_id", "I-VEHICLE_VRM": "personal_id", "I-OTHER_ID": "personal_id", "I-PIN": "personal_id", "I-URL": "url", "I-COMPANY_NAME": "O", "I-COUNTRY": "O", "I-PREFIX": "O"}, "GRANULAR_BIO_TO_LABELS": {"O": "O", "B-NAME": "B-NAME", "B-SSN": "B-SSN", "B-SSN_LAST_4": "B-SSN_LAST_4", "B-DATE_OF_BIRTH": "B-DATE_OF_BIRTH", "B-DATE": "B-DATE", "B-EMAIL": "B-EMAIL", "B-PHONE_NUMBER": "B-PHONE_NUMBER", "B-USERNAME": "B-USERNAME", "B-STREET_NAME_AND_NUMBER": "B-STREET_NAME_AND_NUMBER", "B-CITY": "B-CITY", "B-STATE": "B-STATE", "B-ZIP_CODE": "B-ZIP_CODE", "B-CREDIT_CARD_CVV": "B-CREDIT_CARD_CVV", "B-CREDIT_CARD_NUMBER": "B-CREDIT_CARD_NUMBER", "B-CREDIT_CARD_EXP_DATE": "B-CREDIT_CARD_EXP_DATE", "B-CREDIT_CARD_NUMBER_LAST_4": "B-CREDIT_CARD_NUMBER_LAST_4", "B-IP_V4": "B-IP_V4", "B-IP_V6": "B-IP_V6", "B-MAC": "B-MAC", "B-PASSWORD": "B-PASSWORD", "B-API_KEY": "B-API_KEY", "B-ACCOUNT_NUMBER": "B-ACCOUNT_NUMBER", "B-BIC": "B-BIC", "B-IBAN": "B-IBAN", "B-DRIVER_LICENSE": "B-DRIVER_LICENSE", "B-PHONE_IMEI": "B-PHONE_IMEI", "B-VEHICLE_VIN": "B-VEHICLE_VIN", "B-VEHICLE_VRM": "B-VEHICLE_VRM", "B-OTHER_ID": "B-OTHER_ID", "B-PIN": "B-PIN", "B-URL": "B-URL", "B-COMPANY_NAME": "B-COMPANY_NAME", "B-COUNTRY": "B-COUNTRY", "B-PREFIX": "B-PREFIX", "I-NAME": "I-NAME", "I-SSN": "I-SSN", "I-SSN_LAST_4": "I-SSN_LAST_4", "I-DATE_OF_BIRTH": "I-DATE_OF_BIRTH", "I-DATE": "I-DATE", "I-EMAIL": "I-EMAIL", "I-PHONE_NUMBER": "I-PHONE_NUMBER", "I-USERNAME": "I-USERNAME", "I-STREET_NAME_AND_NUMBER": "I-STREET_NAME_AND_NUMBER", "I-CITY": "I-CITY", "I-STATE": "I-STATE", "I-ZIP_CODE": "I-ZIP_CODE", "I-CREDIT_CARD_CVV": "I-CREDIT_CARD_CVV", "I-CREDIT_CARD_NUMBER": "I-CREDIT_CARD_NUMBER", "I-CREDIT_CARD_EXP_DATE": "I-CREDIT_CARD_EXP_DATE", "I-CREDIT_CARD_NUMBER_LAST_4": "I-CREDIT_CARD_NUMBER_LAST_4", "I-IP_V4": "I-IP_V4", "I-IP_V6": "I-IP_V6", "I-MAC": "I-MAC", "I-PASSWORD": "I-PASSWORD", "I-API_KEY": "I-API_KEY", "I-ACCOUNT_NUMBER": "I-ACCOUNT_NUMBER", "I-BIC": "I-BIC", "I-IBAN": "I-IBAN", "I-DRIVER_LICENSE": "I-DRIVER_LICENSE", "I-PHONE_IMEI": "I-PHONE_IMEI", "I-VEHICLE_VIN": "I-VEHICLE_VIN", "I-VEHICLE_VRM": "I-VEHICLE_VRM", "I-OTHER_ID": "I-OTHER_ID", "I-PIN": "I-PIN", "I-URL": "I-URL", "I-COMPANY_NAME": "I-COMPANY_NAME", "I-COUNTRY": "I-COUNTRY", "I-PREFIX": "I-PREFIX"}, "RE_USE_WEIGHTS": true, "original_model_ckpt": "lakshyakh93/deberta_finetuned_pii", "model_code": "def restrict_model_to_new_classes(model_ckpt, label2id: dict[str, int], encoding: str = \"IO\", re_use_weights: bool = True):\n model = AutoModelForTokenClassification.from_pretrained(model_ckpt) #, token=HF_TOKEN)\n\n # Create a new head with all the classes.\n # new_classifier = Linear(in_features=model.config.pooler_hidden_size, out_features=len(label2id), device=DEVICE)\n new_classifier = Linear(in_features=model.config.hidden_size, out_features=len(label2id), device=DEVICE)\n\n if encoding == \"BIO\":\n # Re-use weights for existing classes in the model\n if re_use_weights:\n new_classes = [class_ for class_ in label2id if class_ not in model.config.label2id]\n existing_classes = [class_ for class_ in label2id if class_ in model.config.label2id]\n for class_ in existing_classes:\n class_old_idx = model.config.label2id[class_]\n class_new_idx = label2id[class_]\n with torch.no_grad():\n new_classifier.weight[class_new_idx, ...] = Parameter(model.classifier.weight[class_old_idx, ...])\n print(f\"Keeping {len(existing_classes)} dimensions from the original model: {existing_classes}\")\n print(f\"Didn't find weights for {len(new_classes)} labels: {new_classes}\")\n print()\n\n elif encoding == \"IO\":\n # TODO - any way to reuse the heads they provided ? By aggregating the B- and I- ? not sure\n pass\n else:\n raise ValueError(\"Not supporting encoding \", encoding)\n\n model.classifier = new_classifier\n model.config.label2id = label2id\n model.config.id2label = {new_id: class_ for class_, new_id in label2id.items()}\n\n model.config.num_labels = len(label2id)\n model.num_labels = len(label2id)\n\n return model.to(DEVICE)\n", "dataset": "MLexperiments/pii-dataset-NER-labels-v3.8", "WandB_link": "", "model_save_ckpt": "MLexperiments/pii-NER-20240509-023506"}