Farnazgh committed
Commit
79d722e
1 Parent(s): 74ccb05

add new transformers model for french + update entities

README.md CHANGED
@@ -1,6 +1,6 @@
 ---
-title: Anonymizer demo
-emoji: 👁
+title: Aliae Anonymizer
+emoji: 😻
 colorFrom: gray
 colorTo: gray
 sdk: streamlit
@@ -9,4 +9,4 @@ app_file: app.py
 pinned: false
 ---

-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/presidio_helpers.cpython-310.pyc CHANGED
Binary files a/__pycache__/presidio_helpers.cpython-310.pyc and b/__pycache__/presidio_helpers.cpython-310.pyc differ
 
__pycache__/presidio_nlp_engine_config.cpython-310.pyc CHANGED
Binary files a/__pycache__/presidio_nlp_engine_config.cpython-310.pyc and b/__pycache__/presidio_nlp_engine_config.cpython-310.pyc differ
 
__pycache__/transformers_class.cpython-310.pyc ADDED
Binary file (1.81 kB)
 
app.py CHANGED
@@ -56,7 +56,7 @@ st_ta_key = st_ta_endpoint = ""

 model_list = [
     "spaCy/en_core_web_lg",
-    "spaCy/fr_core_news_md",
+    "spaCy/fr_core_news_lg",
 ]
 # "flair/ner-english-large",
 #
@@ -78,7 +78,7 @@ lang = st.sidebar.selectbox(

 # Extract model package.
 # st_model_package = st_model.split("/")[0]
-st_model_package = 'spaCy'
+

 # # Remove package prefix (if needed)
 # st_model = (
@@ -87,8 +87,14 @@ st_model_package = 'spaCy'
 #     else "/".join(st_model.split("/")[1:])
 # )
 st_model = 'en_core_web_lg'
-if lang =='en': st_model = 'en_core_web_lg'
-elif lang == 'fr' : st_model = 'fr_core_news_md'
+st_model_package = "spaCy"
+
+if lang == 'en':
+    st_model_package = "spaCy"
+    st_model = 'en_core_web_lg'
+elif lang == 'fr':
+    st_model_package = "HuggingFace"
+    st_model = 'fr_core_news_lg'

 # if st_model == "Other":
 #     st_model_package = st.sidebar.selectbox(
presidio_helpers.py CHANGED
@@ -24,7 +24,7 @@ from presidio_anonymizer.entities import OperatorConfig
 from presidio_nlp_engine_config import (
     create_nlp_engine_with_spacy,
     # create_nlp_engine_with_flair,
-    # create_nlp_engine_with_transformers,
+    create_nlp_engine_with_transformers,
     # create_nlp_engine_with_azure_text_analytics,
 )

@@ -99,6 +99,7 @@ def get_supported_entities(
     #     model_family, model_path, ta_key, ta_endpoint
     # ).get_supported_entities() + ["GENERIC_PII"]
     return ["PERSON", "IBAN_CODE", "PHONE_NUMBER", "CREDIT_CARD", "CRYPTO", "DATE_TIME", "EMAIL_ADDRESS", "IP_ADDRESS", "NRP", "LOCATION", "URL", "FRENCH_SSN", "FRENCH_PASS", "FRENCH_NID"]
+    #


 @st.cache_data
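
With create_nlp_engine_with_transformers now imported, the helper layer can route a request to the right engine based on the model package that app.py sets ("spaCy" for English, "HuggingFace" for French). A minimal sketch of that dispatch, assuming a helper keyed on the package string; the actual function in presidio_helpers.py may differ in name and signature:

    def nlp_engine_and_registry(model_family: str, model_path: str):
        # "spaCy" / "HuggingFace" mirror the st_model_package values set in app.py
        if model_family == "spaCy":
            return create_nlp_engine_with_spacy(model_path)
        if model_family == "HuggingFace":
            return create_nlp_engine_with_transformers(model_path)
        raise ValueError(f"Model family {model_family} is not supported")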
presidio_nlp_engine_config.py CHANGED
@@ -3,6 +3,7 @@ import logging
 import spacy
 from presidio_analyzer import RecognizerRegistry
 from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
+from transformers_class import TransformerRecognizer

 logger = logging.getLogger("presidio-streamlit")

@@ -34,108 +35,78 @@ def create_nlp_engine_with_spacy(
     return nlp_engine, registry


-# def create_nlp_engine_with_transformers(
-#     model_path: str,
-# ) -> Tuple[NlpEngine, RecognizerRegistry]:
-#     """
-#     Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.
-#     The TransformersRecognizer would return results from Transformers models, the spaCy model
-#     would return NlpArtifacts such as POS and lemmas.
-#     :param model_path: HuggingFace model path.
-#     """
-#
-#     from transformers_rec import (
-#         STANFORD_COFIGURATION,
-#         BERT_DEID_CONFIGURATION,
-#         TransformersRecognizer,
-#     )
-#
-#     registry = RecognizerRegistry()
-#     registry.load_predefined_recognizers()
-#
-#     if not spacy.util.is_package("en_core_web_sm"):
-#         spacy.cli.download("en_core_web_sm")
-#     # Using a small spaCy model + a HF NER model
-#     transformers_recognizer = TransformersRecognizer(model_path=model_path)
-#
-#     if model_path == "StanfordAIMI/stanford-deidentifier-base":
-#         transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
-#     elif model_path == "obi/deid_roberta_i2b2":
-#         transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
-#     else:
-#         print(f"Warning: Model has no configuration, loading default.")
-#         transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
-#
-#     # Use small spaCy model, no need for both spacy and HF models
-#     # The transformers model is used here as a recognizer, not as an NlpEngine
-#     nlp_configuration = {
-#         "nlp_engine_name": "spacy",
-#         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
-#     }
-#
-#     registry.add_recognizer(transformers_recognizer)
-#     registry.remove_recognizer("SpacyRecognizer")
-#
-#     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
-#
-#     return nlp_engine, registry
-
-
-# def create_nlp_engine_with_flair(
-#     model_path: str,
-# ) -> Tuple[NlpEngine, RecognizerRegistry]:
-#     """
-#     Instantiate an NlpEngine with a FlairRecognizer and a small spaCy model.
-#     The FlairRecognizer would return results from Flair models, the spaCy model
-#     would return NlpArtifacts such as POS and lemmas.
-#     :param model_path: Flair model path.
-#     """
-#     from flair_recognizer import FlairRecognizer
-#
-#     registry = RecognizerRegistry()
-#     registry.load_predefined_recognizers()
-#
-#     if not spacy.util.is_package("en_core_web_sm"):
-#         spacy.cli.download("en_core_web_sm")
-#     # Using a small spaCy model + a Flair NER model
-#     flair_recognizer = FlairRecognizer(model_path=model_path)
-#     nlp_configuration = {
-#         "nlp_engine_name": "spacy",
-#         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
-#     }
-#     registry.add_recognizer(flair_recognizer)
-#     registry.remove_recognizer("SpacyRecognizer")
-#
-#     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
-#
-#     return nlp_engine, registry
-
-
-# def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
-#     """
-#     Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
-#     The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
-#     would return NlpArtifacts such as POS and lemmas.
-#     :param ta_key: Azure Text Analytics key.
-#     :param ta_endpoint: Azure Text Analytics endpoint.
-#     """
-#     from text_analytics_wrapper import TextAnalyticsWrapper
-#
-#     if not ta_key or not ta_endpoint:
-#         raise RuntimeError("Please fill in the Text Analytics endpoint details")
-#
-#     registry = RecognizerRegistry()
-#     registry.load_predefined_recognizers()
-#
-#     ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
-#     nlp_configuration = {
-#         "nlp_engine_name": "spacy",
-#         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
-#     }
-#
-#     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
-#
-#     registry.add_recognizer(ta_recognizer)
-#     registry.remove_recognizer("SpacyRecognizer")
-#
-#     return nlp_engine, registry
+def create_nlp_engine_with_transformers(
+    model_path: str,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """
+    Instantiate an NlpEngine with a TransformerRecognizer and a small spaCy model.
+    The TransformerRecognizer returns results from the Transformers model; the spaCy
+    model returns NlpArtifacts such as POS tags and lemmas.
+    :param model_path: spaCy model to use for NlpArtifacts.
+    """
+    # Use a small spaCy model; no need for both spaCy and HF NER models.
+    # The transformers model is used here as a recognizer, not as an NlpEngine.
+    if not spacy.util.is_package(model_path):
+        spacy.cli.download(model_path)
+
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": model_path.split('_')[0], "model_name": model_path}],
+    }
+
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    registry = RecognizerRegistry()
+    registry = load_predefined_recognizers(registry)
+
+    mapping_labels = {"PER": "PERSON", "LOC": "LOCATION"}
+    model_name = "AliaeAI/camembert_anonymizer_production_v2"  # alternatives: "Jean-Baptiste/camembert-ner", "AliaeAI/camembert_anonymizer_production"
+    transformers_recognizer = TransformerRecognizer(model_name, mapping_labels)
+
+    registry.add_recognizer(transformers_recognizer)
+    registry.remove_recognizer("SpacyRecognizer")
+
+    return nlp_engine, registry
+
+
+from presidio_analyzer.predefined_recognizers import PhoneRecognizer, EmailRecognizer, CreditCardRecognizer, CryptoRecognizer, DateRecognizer, IpRecognizer, IbanRecognizer, UrlRecognizer
+import phonenumbers
+
+
+def load_predefined_recognizers(registry, lang='fr'):
+    """Register French-language variants of Presidio's predefined recognizers."""
+    # phone number
+    phone_recognizer_fr = PhoneRecognizer(supported_language=lang, supported_regions=phonenumbers.SUPPORTED_REGIONS, context=['téléphone'])
+    registry.add_recognizer(phone_recognizer_fr)
+
+    # email
+    email_recognizer_fr = EmailRecognizer(supported_language=lang, context=["email", "mail", "e-mail"])
+    registry.add_recognizer(email_recognizer_fr)
+
+    # credit card
+    creditcard_recognizer_fr = CreditCardRecognizer(supported_language=lang, context=["crédit", "carte", "carte de crédit"])
+    registry.add_recognizer(creditcard_recognizer_fr)
+
+    # crypto
+    crypto_recognizer_fr = CryptoRecognizer(supported_language=lang, context=["crypto"])
+    registry.add_recognizer(crypto_recognizer_fr)
+
+    # date time
+    date_recognizer_fr = DateRecognizer(supported_language=lang, context=["mois", "date", "jour", "année"])
+    registry.add_recognizer(date_recognizer_fr)
+
+    # ip address
+    ip_recognizer_fr = IpRecognizer(supported_language=lang, context=["IP", "ip"])
+    registry.add_recognizer(ip_recognizer_fr)
+
+    # iban
+    iban_recognizer_fr = IbanRecognizer(supported_language=lang, context=["IBAN", "iban", "bancaire", "compte"])
+    registry.add_recognizer(iban_recognizer_fr)
+
+    # URL
+    url_recognizer_fr = UrlRecognizer(supported_language=lang, context=["site", "web"])
+    registry.add_recognizer(url_recognizer_fr)
+
+    # pattern recognizers defined in YAML (FRENCH_SSN, FRENCH_PASS, FRENCH_NID)
+    registry.add_recognizers_from_yaml("recognizers.yaml")
+
+    return registry
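
For reference, here is a minimal sketch of exercising the new engine end to end through presidio-analyzer's standard AnalyzerEngine API; the sample sentence is illustrative, and "fr_core_news_lg" mirrors the spaCy model app.py selects for French:

    from presidio_analyzer import AnalyzerEngine

    nlp_engine, registry = create_nlp_engine_with_transformers("fr_core_news_lg")
    analyzer = AnalyzerEngine(
        nlp_engine=nlp_engine,
        registry=registry,
        supported_languages=["fr"],
    )

    results = analyzer.analyze(text="Jean Dupont habite à Paris.", language="fr")
    for res in results:
        print(res.entity_type, res.start, res.end, res.score)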
recognizers.yaml CHANGED
@@ -1,15 +1,15 @@
 recognizers:
-  -
-    name: "FRENCH_NID"
-    supported_language: "fr"
-    patterns:
-      -
-        name: "FRENCH_NID"
-        regex: "[0-9]{12}|([A-Z]|[0-9]){9}"
-        score: 0.5
-    context:
-      - national
-    supported_entity: "FRENCH_NID"
+  # -
+  #   name: "FRENCH_NID"
+  #   supported_language: "fr"
+  #   patterns:
+  #     -
+  #       name: "FRENCH_NID"
+  #       regex: "[0-9]{12}|([A-Z]|[0-9]){9}"
+  #       score: 0.5
+  #   context:
+  #     - national
+  #   supported_entity: "FRENCH_NID"
   -
     name: "FRENCH_NID"
     supported_language: "en"
requirements.txt CHANGED
@@ -7,7 +7,5 @@ python-dotenv
 st-annotated-text
 torch
 transformers
-flair
-openai
 spacy
 azure-ai-textanalytics
transformers_class.py ADDED
@@ -0,0 +1,58 @@
+from transformers import pipeline
+from presidio_analyzer import (
+    RecognizerResult,
+    EntityRecognizer,
+    AnalysisExplanation,
+)
+from presidio_analyzer.nlp_engine import NlpEngine, SpacyNlpEngine, NlpArtifacts
+
+
+class TransformerRecognizer(EntityRecognizer):
+    def __init__(
+        self,
+        model_id_or_path,
+        mapping_labels,
+        aggregation_strategy="simple",
+        supported_language="fr",
+        ignore_labels=["O", "MISC"],
+    ):
+        # initialize a transformers token-classification pipeline for the given model id or path
+        self.pipeline = pipeline(
+            "token-classification",
+            model=model_id_or_path,
+            aggregation_strategy=aggregation_strategy,
+            ignore_labels=ignore_labels,
+        )
+        # map model labels to Presidio entity labels
+        self.label2presidio = mapping_labels
+
+        # pass the supported entities on to the parent class
+        super().__init__(
+            supported_entities=list(self.label2presidio.values()),
+            supported_language=supported_language,
+        )
+
+    def load(self) -> None:
+        """No loading is required."""
+        pass
+
+    def analyze(self, text: str, entities=None, nlp_artifacts: NlpArtifacts = None):
+        """Extract entities using the Transformers pipeline."""
+        results = []
+
+        predicted_entities = self.pipeline(text)
+        for e in predicted_entities:
+            if e["entity_group"] not in self.label2presidio:
+                continue
+            converted_entity = self.label2presidio[e["entity_group"]]
+            # test entities is None first so membership is only checked against a real list
+            if entities is None or converted_entity in entities:
+                results.append(
+                    RecognizerResult(
+                        entity_type=converted_entity,
+                        start=e["start"],
+                        end=e["end"],
+                        score=e["score"],
+                    )
+                )
+        return results
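
The recognizer can also be smoke-tested on its own, outside Presidio's engine plumbing. A quick sketch using "Jean-Baptiste/camembert-ner", one of the models named in the comment in presidio_nlp_engine_config.py; any HF token-classification model that emits PER/LOC entity groups should behave the same way:

    mapping_labels = {"PER": "PERSON", "LOC": "LOCATION"}
    recognizer = TransformerRecognizer("Jean-Baptiste/camembert-ner", mapping_labels)

    hits = recognizer.analyze("Marie Curie est née à Varsovie.", entities=["PERSON", "LOCATION"])
    for hit in hits:
        print(hit.entity_type, hit.start, hit.end, round(float(hit.score), 3))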