Spaces:
Running
Running
Delete text_analytics_wrapper.py
Browse files- text_analytics_wrapper.py +0 -123
text_analytics_wrapper.py
DELETED
@@ -1,123 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from typing import List, Optional
|
3 |
-
import logging
|
4 |
-
import dotenv
|
5 |
-
from azure.ai.textanalytics import TextAnalyticsClient
|
6 |
-
from azure.core.credentials import AzureKeyCredential
|
7 |
-
|
8 |
-
from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
|
9 |
-
from presidio_analyzer.nlp_engine import NlpArtifacts
|
10 |
-
|
11 |
-
# Module-level logger, registered under the "presidio-streamlit" logger name.
logger = logging.getLogger("presidio-streamlit")
|
12 |
-
|
13 |
-
class TextAnalyticsWrapper(EntityRecognizer):
    """
    Presidio recognizer that delegates PII detection to Azure Text Analytics.

    Each call to :meth:`analyze` sends the text to the service through a
    ``TextAnalyticsClient`` and converts the returned PII entities into
    ``RecognizerResult`` objects.
    """

    # Imported inside the class body, so ``PiiEntityCategory`` is also
    # reachable as a class attribute.
    from azure.ai.textanalytics._models import PiiEntityCategory

    # Default entity list: every value declared by ``PiiEntityCategory``.
    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

    def __init__(
        self,
        supported_entities: Optional[List[str]] = None,
        supported_language: str = "en",
        ta_client: Optional[TextAnalyticsClient] = None,
        ta_key: Optional[str] = None,
        ta_endpoint: Optional[str] = None,
    ):
        """
        Wrapper for the Azure Text Analytics client

        :param supported_entities: entity categories this recognizer reports;
            falls back to ``TA_SUPPORTED_ENTITIES`` when empty or ``None``
        :param supported_language: language code passed to the service
        :param ta_client: object of type TextAnalyticsClient; when omitted,
            a client is created from ``ta_key`` and ``ta_endpoint``
        :param ta_key: Azure cognitive Services for Language key
        :param ta_endpoint: Azure cognitive Services for Language endpoint
        """

        if not supported_entities:
            supported_entities = self.TA_SUPPORTED_ENTITIES

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Azure Text Analytics PII",
        )

        self.ta_key = ta_key
        self.ta_endpoint = ta_endpoint

        # Only build a client when the caller did not inject one.
        if not ta_client:
            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
        self.ta_client = ta_client

    @staticmethod
    def __authenticate_client(key: str, endpoint: str):
        """
        Create a ``TextAnalyticsClient`` from a key and an endpoint.

        :param key: key wrapped in an ``AzureKeyCredential``
        :param endpoint: endpoint passed to the client
        :return: the constructed ``TextAnalyticsClient``
        """
        ta_credential = AzureKeyCredential(key)
        text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint, credential=ta_credential
        )
        return text_analytics_client

    def analyze(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """
        Detect PII entities in ``text`` using the Text Analytics client.

        :param text: text to analyze; sent to the service as a single document
        :param entities: accepted for interface compatibility; not used here,
            results are filtered by ``self.supported_entities`` instead
        :param nlp_artifacts: accepted for interface compatibility; not used
        :return: one ``RecognizerResult`` per returned entity whose category
            is in ``self.supported_entities``
        """
        response = self.ta_client.recognize_pii_entities(
            [text], language=self.supported_language
        )

        recognizer_results = []
        for doc in response:
            if doc.is_error:
                # Errored documents are still skipped, but no longer silently.
                logger.warning(
                    "Text Analytics returned an error for a document: %s",
                    getattr(doc, "error", None),
                )
                continue

            for entity in doc.entities:
                if entity.category not in self.supported_entities:
                    continue
                analysis_explanation = TextAnalyticsWrapper._build_explanation(
                    original_score=entity.confidence_score,
                    entity_type=entity.category,
                )
                recognizer_results.append(
                    RecognizerResult(
                        entity_type=entity.category,
                        start=entity.offset,
                        # End offset is derived from the matched text length.
                        end=entity.offset + len(entity.text),
                        score=entity.confidence_score,
                        analysis_explanation=analysis_explanation,
                    )
                )

        return recognizer_results

    @staticmethod
    def _build_explanation(
        original_score: float, entity_type: str
    ) -> AnalysisExplanation:
        """
        Build the explanation object attached to each result.

        :param original_score: confidence score reported for the entity
        :param entity_type: category reported for the entity
        :return: an ``AnalysisExplanation`` naming this recognizer class
        """
        explanation = AnalysisExplanation(
            # ``__name__`` yields "TextAnalyticsWrapper"; ``__class__.__name__``
            # on the class object would yield the metaclass name instead.
            recognizer=TextAnalyticsWrapper.__name__,
            original_score=original_score,
            textual_explanation=f"Identified as {entity_type} by Text Analytics",
        )
        return explanation

    def load(self) -> None:
        """Do nothing; this method has no loading work to perform."""
        pass
|
99 |
-
|
100 |
-
|
101 |
-
if __name__ == "__main__":
    # Manual smoke test: build an analyzer backed by Azure Text Analytics and
    # run it once over a sample paragraph. Requires TA_KEY and TA_ENDPOINT in
    # the environment (optionally supplied through a .env file).
    import presidio_helpers

    dotenv.load_dotenv()

    sample_text = """
Here are a few example sentences we currently support:

Hello, my name is David Johnson and I live in Maine.
My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.

On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.

My passport: 191280342 and my phone number: (212) 555-1234.

This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?

Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
"""

    # Keys are read in the same order as the original keyword arguments.
    engine_options = {
        "model_path": "Azure Text Analytics PII",
        "ta_key": os.environ["TA_KEY"],
        "ta_endpoint": os.environ["TA_ENDPOINT"],
    }
    engine = presidio_helpers.analyzer_engine(**engine_options)

    # The result is intentionally discarded; this only exercises the call path.
    engine.analyze(text=sample_text, language="en")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|