File size: 5,755 Bytes
7eaf90c
1
{"MAX_N_TOKENS": 256, "LABEL_ENCODING": "IO", "LABEL2ID": {"O": 0, "address": 1, "credit_card_info": 2, "date": 3, "date_of_birth": 4, "email": 5, "name": 6, "network_info": 7, "password": 8, "personal_ids": 9, "phone_number": 10, "ssn": 11, "url": 12, "username": 13}, "LABELS_TO_PROD": {"name": "name", "ssn": "ssn", "date_of_birth": "date_of_birth", "date": "date", "address": "address", "personal_ids": "personal_ids", "email": "email", "phone_number": "phone_number", "credit_card_info": "credit_card_info", "password": "password", "network_info": "network_info", "url": "url", "username": "username", "O": "O"}, "GRANULAR_BIO_TO_LABELS": {"B-FIRSTNAME": "name", "B-LASTNAME": "name", "B-MIDDLENAME": "name", "B-NAME": "name", "B-FULLNAME": "name", "B-SSN": "ssn", "B-DOB": "date_of_birth", "B-DATE": "date", "B-BUILDINGNUMBER": "address", "B-STREET": "address", "B-SECONDARYADDRESS": "address", "B-NEARBYGPSCOORDINATE": "address", "B-STREETADDRESS": "address", "B-ACCOUNTNAME": "personal_ids", "B-ACCOUNTNUMBER": "personal_ids", "B-BIC": "personal_ids", "B-BITCOINADDRESS": "personal_ids", "B-ETHEREUMADDRESS": "personal_ids", "B-IBAN": "personal_ids", "B-LITECOINADDRESS": "personal_ids", "B-MASKEDNUMBER": "personal_ids", "B-PHONEIMEI": "personal_ids", "B-VEHICLEVIN": "personal_ids", "B-VEHICLEVRM": "personal_ids", "B-PIN": "personal_ids", "B-EMAIL": "email", "B-PHONENUMBER": "phone_number", "B-PHONE_NUMBER": "phone_number", "B-CREDITCARDCVV": "credit_card_info", "B-CREDITCARDNUMBER": "credit_card_info", "B-PASSWORD": "password", "B-IP": "network_info", "B-IPV4": "network_info", "B-IPV6": "network_info", "B-USERAGENT": "network_info", "B-MAC": "network_info", "B-URL": "url", "B-USERNAME": "username", "B-DISPLAYNAME": "username", "B-AMOUNT": "O", "B-CITY": "O", "B-COMPANYNAME": "O", "B-COMPANY_NAME": "O", "B-COUNTY": "O", "B-CREDITCARDISSUER": "O", "B-CURRENCY": "O", "B-CURRENCYCODE": "O", "B-CURRENCYNAME": "O", "B-CURRENCYSYMBOL": "O", "B-JOBAREA": "O", "B-JOBTITLE": "O", "B-JOBTYPE": "O", "B-JOBDESCRIPTOR": "O", "B-ORDINALDIRECTION": "O", "B-STATE": "O", "B-ZIPCODE": "O", "B-AGE": "O", "B-EYECOLOR": "O", "B-HEIGHT": "O", "B-TIME": "O", "B-PREFIX": "O", "B-SEX": "O", "B-GENDER": "O", "B-SEXTYPE": "O", "B-NUMBER": "O", "B-SUFFIX": "O", "I-FIRSTNAME": "name", "I-LASTNAME": "name", "I-MIDDLENAME": "name", "I-NAME": "name", "I-FULLNAME": "name", "I-SSN": "ssn", "I-DOB": "date_of_birth", "I-DATE": "date", "I-BUILDINGNUMBER": "address", "I-STREET": "address", "I-SECONDARYADDRESS": "address", "I-NEARBYGPSCOORDINATE": "address", "I-STREETADDRESS": "address", "I-ACCOUNTNAME": "personal_ids", "I-ACCOUNTNUMBER": "personal_ids", "I-BIC": "personal_ids", "I-BITCOINADDRESS": "personal_ids", "I-ETHEREUMADDRESS": "personal_ids", "I-IBAN": "personal_ids", "I-LITECOINADDRESS": "personal_ids", "I-MASKEDNUMBER": "personal_ids", "I-PHONEIMEI": "personal_ids", "I-VEHICLEVIN": "personal_ids", "I-VEHICLEVRM": "personal_ids", "I-PIN": "personal_ids", "I-EMAIL": "email", "I-PHONENUMBER": "phone_number", "I-PHONE_NUMBER": "phone_number", "I-CREDITCARDCVV": "credit_card_info", "I-CREDITCARDNUMBER": "credit_card_info", "I-PASSWORD": "password", "I-IP": "network_info", "I-IPV4": "network_info", "I-IPV6": "network_info", "I-USERAGENT": "network_info", "I-MAC": "network_info", "I-URL": "url", "I-USERNAME": "username", "I-DISPLAYNAME": "username", "I-AMOUNT": "O", "I-CITY": "O", "I-COMPANYNAME": "O", "I-COMPANY_NAME": "O", "I-COUNTY": "O", "I-CREDITCARDISSUER": "O", "I-CURRENCY": "O", "I-CURRENCYCODE": "O", "I-CURRENCYNAME": "O", "I-CURRENCYSYMBOL": "O", "I-JOBAREA": "O", "I-JOBTITLE": "O", "I-JOBTYPE": "O", "I-JOBDESCRIPTOR": "O", "I-ORDINALDIRECTION": "O", "I-STATE": "O", "I-ZIPCODE": "O", "I-AGE": "O", "I-EYECOLOR": "O", "I-HEIGHT": "O", "I-TIME": "O", "I-PREFIX": "O", "I-SEX": "O", "I-GENDER": "O", "I-SEXTYPE": "O", "I-NUMBER": "O", "I-SUFFIX": "O"}, "original_model_ckpt": "lakshyakh93/deberta_finetuned_pii", "model_code": "def restrict_model_to_new_classes(model_ckpt, label2id: dict[str, int], encoding: str = \"IO\", re_use_weights: bool = True):\n  model = AutoModelForTokenClassification.from_pretrained(model_ckpt, token=HF_TOKEN)\n\n  # Create a new head with all the classes.\n  new_classifier = Linear(in_features=model.config.pooler_hidden_size, out_features=len(label2id), device=DEVICE)\n\n  if encoding == \"BIO\":\n    # Re-use weights for existing classes in the model\n    if re_use_weights:\n      new_classes = [class_ for class_ in label2id if class_ not in model.config.label2id]\n      existing_classes = [class_ for class_ in label2id if class_ in model.config.label2id]\n      for class_ in existing_classes:\n        class_old_idx = model.config.label2id[class_]\n        class_new_idx = label2id[class_]\n        with torch.no_grad():\n          new_classifier.weight[class_new_idx, ...] = Parameter(model.classifier.weight[class_old_idx, ...])\n      print(f\"Keeping {len(existing_classes)} dimensions from the {len(label2id)} given labels\")\n      print(f\"Didn't find weights for the labels {new_classes}\")\n    print()\n\n  elif encoding == \"IO\":\n    # TODO - any way to reuse the heads they provided ? By aggregating the B- and I- ? not sure\n    pass\n  else:\n    raise ValueError(\"Not supporting encoding \", encoding)\n\n  model.classifier = new_classifier\n  model.config.label2id = label2id\n  model.config.id2label = {new_id: class_ for class_, new_id in label2id.items()}\n\n  model.config.num_labels = len(label2id)\n  model.num_labels = len(label2id)\n\n  return model.to(DEVICE)\n", "dataset": "MLexperiments/pii-dataset-NER-production-test", "WandB_link": "https://wandb.ai/bogdan-gal/pii_ner_exp1/runs/lna6no2k", "model_save_ckpt": "MLexperiments/pii-NER-20240410-025921"}