File size: 1,608 Bytes
6a804e1
c6e6478
6a804e1
c6e6478
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07067dd
c6e6478
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import streamlit as st
from transformers import pipeline, BigBirdTokenizerFast, AutoModelForTokenClassification


def _load_pii_classifier():
    """Build the BigBird token-classification pipeline used for PII detection.

    Returns:
        A ``transformers`` pipeline for the ``token-classification`` task that
        uses the ``vedantM/BigBird-PII`` model with the
        ``google/bigbird-roberta-base`` fast tokenizer and the ``average``
        aggregation strategy.
    """
    # NOTE(review): block_size is passed to the tokenizer exactly as before;
    # confirm that this is where the setting is meant to go.
    tokenizer = BigBirdTokenizerFast.from_pretrained("google/bigbird-roberta-base", block_size=2)
    model = AutoModelForTokenClassification.from_pretrained("vedantM/BigBird-PII")
    return pipeline(
        task="token-classification",
        model=model,
        aggregation_strategy="average",
        tokenizer=tokenizer,
    )


def pii_app():
    """Render the PII detection page.

    Shows a text area, runs the token-classification pipeline on the entered
    text, lists every detected entity with its type, and renders the input
    with the detected spans highlighted.

    Nothing beyond the title and the text area is rendered while the input is
    empty or whitespace-only, so the model is neither loaded nor run until
    there is text to analyse.
    """
    st.title('PII Data Detection')
    text_input = st.text_area('Enter a Paragraph below to get list of PII in your text.')

    # Skip model loading and inference until the user has typed something.
    if not text_input or not text_input.strip():
        return

    # Wrap the loader in Streamlit's resource cache so the tokenizer and model
    # are built once and reused, instead of being reloaded on every call.
    # The wrapper is applied here rather than as a decorator so that importing
    # this module does not touch Streamlit.
    big_bird_classifier = st.cache_resource(_load_pii_classifier)()
    output = big_bird_classifier(text_input)

    st.header('List of Entities:')
    for entity in output:
        st.write(f"Entity: {entity['word']}, Type: {entity['entity_group']}")

    highlighted_text = highlight_pii(text_input, output)
    st.header('\nPII Detected Output:')
    # highlight_pii returns HTML markup, hence unsafe_allow_html.
    st.markdown(highlighted_text, unsafe_allow_html=True)


def highlight_pii(text, entities):
    """Return *text* as HTML with every detected entity wrapped in a span.

    Args:
        text: The original input string.
        entities: Iterable of mappings, each with integer ``"start"`` and
            ``"end"`` keys giving character offsets into ``text``
            (``end`` exclusive).

    Returns:
        An HTML string in which each entity span is wrapped in
        ``<span style="background-color: blue">...</span>``. All text taken
        from ``text`` is HTML-escaped (``&``, ``<``, ``>``), because the
        result is meant to be rendered as raw HTML.

    Entities are processed in order of their start offset regardless of the
    order they are passed in. An entity that starts inside an already
    highlighted span is skipped so the generated markup stays well-formed.
    """
    import html  # local import keeps this block self-contained

    open_tag = '<span style="background-color: blue">'
    close_tag = '</span>'

    pieces = []
    cursor = 0  # index in `text` up to which output has been emitted
    for entity in sorted(entities, key=lambda item: item["start"]):
        start_idx = entity["start"]
        end_idx = entity["end"]
        if start_idx < cursor:
            # Overlaps the previous highlight; nesting spans here would
            # duplicate or garble text, so leave it out.
            continue
        # Plain text between the previous highlight and this one.
        pieces.append(html.escape(text[cursor:start_idx], quote=False))
        pieces.append(
            open_tag + html.escape(text[start_idx:end_idx], quote=False) + close_tag
        )
        cursor = end_idx
    # Remaining text after the last highlight.
    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)


# Launch the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    pii_app()