File size: 9,587 Bytes
e8574b8
24e3585
e8574b8
 
c59464c
e8574b8
 
 
c59464c
 
 
 
e8574b8
5cda7f5
c59464c
 
 
5cda7f5
 
e8574b8
c59464c
 
 
18a08c1
e8574b8
 
c59464c
 
 
18a08c1
e8574b8
 
c59464c
 
 
18a08c1
e8574b8
 
c59464c
 
 
 
e8574b8
 
c59464c
 
 
18a08c1
e8574b8
 
c59464c
 
 
18a08c1
e8574b8
 
c59464c
 
 
18a08c1
e8574b8
 
c59464c
 
 
18a08c1
e8574b8
 
c59464c
 
 
18a08c1
e8574b8
 
c59464c
 
 
18a08c1
e8574b8
 
c59464c
 
 
18a08c1
e8574b8
 
c59464c
 
 
18a08c1
 
c59464c
 
 
 
 
 
e8574b8
 
 
5cda7f5
 
 
e8574b8
 
 
 
 
 
 
 
 
 
 
5cda7f5
 
 
e8574b8
 
 
 
63a56b9
e8574b8
 
035dcbf
e8574b8
 
 
035dcbf
 
e8574b8
 
df65a41
63394b2
e8574b8
 
 
 
 
 
 
 
24e3585
e8574b8
24e3585
e8574b8
 
 
24e3585
e8574b8
 
 
 
24e3585
e8574b8
c59464c
e8574b8
4b92f32
c59464c
4b92f32
e8574b8
c59464c
4b92f32
e8574b8
4b92f32
 
e8574b8
 
 
 
 
c59464c
 
 
 
 
 
e8574b8
 
 
 
 
 
18a08c1
 
 
 
 
5cda7f5
 
 
 
 
63a56b9
5cda7f5
 
 
18a08c1
 
 
 
 
 
 
 
e8574b8
 
 
 
 
5cda7f5
e8574b8
 
 
 
 
5cda7f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8574b8
 
18a08c1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
from typing import Dict, Union
from gliner import GLiNER
import gradio as gr

model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

examples = [
    [
        "Pierre Dubois, résident de Paris, a fondé sa propre entreprise, Le Petit Café, située au 15 Rue de la Paix. Son numéro d'entreprise est FR-987654321-1, et il utilise le compte bancaire 9876543210 pour les transactions.",
        "person, organization, address, company registration number, bank account number",
        0.5,
        False,
    ],
    [
        "Leticia Ramírez, una habitante de Barcelona, tiene una cita médica programada en el Hospital General de Cataluña, situado en 10 Calle de los Ángeles. Su número de la seguridad social es ES-123456789-A y su grupo sanguíneo es AB+.",
        "person, location, address, social security number, blood type",
        0.5,
        False,
    ],
    [
        "John Smith, from London, teaches mathematics at Royal Academy located at 25 King’s Road. His employee ID is UK-987654-321 and he has been working there since 2015.",
        "person, profession, organization, address, employee ID number",
        0.5,
        False,
    ],
    [
        "In Frankfurt, Claudia Weber frequently visits her local bank branch, Deutsche Bank, at 48 Hauptstraße. Her account number is DE-1234567890123456, used primarily for her mortgage payments.",
        "person, location, address, bank account number",
        0.5,
        False,
    ],
    [
        "Marta Rossi, residente a Roma, ha acquistato un appartamento al 123 Via Condotti. Il numero di registrazione della proprietà è IT-654321-2018 e il mutuo è gestito tramite la Banca d'Italia con numero di conto 3216549870.",
        "person, address, property registration number, bank account number",
        0.5,
        False,
    ],
    [
        "Paulo Coelho, um turista do Brasil, fez um seguro de viagem com a empresa Seguros PT antes de sua viagem para Lisboa. O número da apólice é BR-987654321-123 e inclui cobertura médica.",
        "person, nationality, company, insurance policy number, coverage",
        0.5,
        False,
    ],
    [
        "Julia Fischer, eine Kundin aus München, hat bei der BayWa AG, einem großen Anbieter von Baustoffen mit Sitz am 77 Industriestraße, einen Kredit aufgenommen. Die Kreditnummer lautet DE-12345678.",
        "person, city, organization, address, loan number",
        0.5,
        False,
    ],
    [
        "Carlos Sánchez, profesor en la Universidad de Madrid, reside en el 5 Calle de Alcalá. Su número de identificación de profesor es ES-192837465 y tiene un doctorado en filosofía.",
        "person, profession, address, teacher ID number, degree",
        0.5,
        False,
    ],
    [
        "Sophie Dupont, une journaliste française, travaille pour Le Monde, basé au 33 rue des Écoles à Paris. Son numéro d'identification de presse est FR-75649023.",
        "person, profession, organization, address, press ID number",
        0.5,
        False,
    ],
    [
        "Manuel Oliveira, um agricultor em Porto, possui uma grande plantação de vinhas na Rua da Estrada, 120. O número de registro agrícola é PT-5678912345.",
        "person, profession, address, agricultural registration number",
        0.5,
        False,
    ],
    [
        "Elisa Müller, eine Künstlerin aus Berlin, hat ihre neueste Skulptur im öffentlichen Park am Alexanderplatz ausgestellt. Ihre Künstlernummer lautet DE-112233445.",
        "person, profession, location, artist ID number",
        0.5,
        False,
    ],
    [
        "Federico García, un jugador de fútbol de Sevilla, ha firmado un contrato de tres años con el club Real Betis. Su número de licencia deportiva es ES-9876543210.",
        "person, profession, organization, sports license number",
        0.5,
        False,
    ],
    [
        "Sarah White, a London-based actress, will be performing in 'Hamlet' at the Globe Theatre located at 21 New Globe Walk. Her Equity membership number is UK-1234567.",
        "person, profession, location, address, membership number",
        0.5,
        False,
    ],
    [
        "Ricardo Mello, engenheiro civil, trabalha na construção da nova barragem no Rio Douro, Portugal. Seu número de registro profissional é PT-987654321.",
        "person, profession, project location, professional registration number",
        0.5,
        False,
    ],
    [
        "Giuseppe Conti, un cliente di Milano, ha fatto un acquisto presso il negozio La Rinascente situato in Piazza Duomo. Il numero della sua carta di credito è IT-4567891234567891.",
        "person, location, address, credit card number",
        0.5,
        False,
    ]
]


def ner(
    text, labels: str, threshold: float, nested_ner: bool
) -> Dict[str, Union[str, int, float]]:
    labels = labels.split(",")
    return {
        "text": text,
        "entities": [
            {
                "entity": entity["label"],
                "word": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "score": 0,
            }
            for entity in model.predict_entities(
                text, labels, flat_ner=not nested_ner, threshold=threshold
            )
        ],
    }


with gr.Blocks(title="GLiNER-M-v2.1") as demo:
    gr.Markdown(
        """
        # GLiNER-PII (Personnally Identifiable Information extraction)

        GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios.

        The model has been trained by fine-tuning urchade/gliner_multi-v2.1 on the urchade/synthetic-pii-ner-mistral-v1 dataset.
        
        ## Links

        * Model: https://huggingface.co/urchade/gliner_multi_pii-v1
        * All GLiNER models: https://huggingface.co/models?library=gliner
        * Paper: https://arxiv.org/abs/2311.08526
        * Repository: https://github.com/urchade/GLiNER
        """
    )
    with gr.Accordion("How to run this model locally", open=False):
        gr.Markdown(
            """
            ## Installation
            To use this model, you must install the GLiNER Python library:
            ```
            !pip install gliner
            ```
         
            ## Usage
            Once you've downloaded the GLiNER library, you can import the GLiNER class. You can then load this model using `GLiNER.from_pretrained` and predict entities with `predict_entities`.
            """
        )
        gr.Code(
            '''
from gliner import GLiNER

model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

text = """
Harilala Rasoanaivo, un homme d'affaires local d'Antananarivo, a enregistré une nouvelle société nommée "Rasoanaivo Enterprises" au Lot II M 92 Antohomadinika. Son numéro est le +261 32 22 345 67, et son adresse électronique est harilala.rasoanaivo@telma.mg. Il a fourni son numéro de sécu 501-02-1234 pour l'enregistrement.
"""

labels = ["work", "booking number", "personally identifiable information", "driver licence", "person", "book", "full address", "company", "actor", "character", "email", "passport number", "Social Security Number", "phone number"]
entities = model.predict_entities(text, labels)

for entity in entities:
    print(entity["text"], "=>", entity["label"])
            ''',
            language="python",
        )
        gr.Code(
            """
Harilala Rasoanaivo => person
Rasoanaivo Enterprises => company
Lot II M 92 Antohomadinika => full address
+261 32 22 345 67 => phone number
harilala.rasoanaivo@telma.mg => email
501-02-1234 => Social Security Number
            """
        )

    input_text = gr.Textbox(
        value=examples[0][0], label="Text input", placeholder="Enter your text here"
    )
    with gr.Row() as row:
        labels = gr.Textbox(
            value=examples[0][1],
            label="Labels",
            placeholder="Enter your labels here (comma separated)",
            scale=2,
        )
        threshold = gr.Slider(
            0,
            1,
            value=0.3,
            step=0.01,
            label="Threshold",
            info="Lower the threshold to increase how many entities get predicted.",
            scale=1,
        )
        nested_ner = gr.Checkbox(
            value=examples[0][2],
            label="Nested NER",
            info="Allow for nested NER?",
            scale=0,
        )
    output = gr.HighlightedText(label="Predicted Entities")
    submit_btn = gr.Button("Submit")
    examples = gr.Examples(
        examples,
        fn=ner,
        inputs=[input_text, labels, threshold, nested_ner],
        outputs=output,
        cache_examples=True,
    )

    # Submitting
    input_text.submit(
        fn=ner, inputs=[input_text, labels, threshold, nested_ner], outputs=output
    )
    labels.submit(
        fn=ner, inputs=[input_text, labels, threshold, nested_ner], outputs=output
    )
    threshold.release(
        fn=ner, inputs=[input_text, labels, threshold, nested_ner], outputs=output
    )
    submit_btn.click(
        fn=ner, inputs=[input_text, labels, threshold, nested_ner], outputs=output
    )
    nested_ner.change(
        fn=ner, inputs=[input_text, labels, threshold, nested_ner], outputs=output
    )

demo.queue()
demo.launch(debug=True)