"""Streamlit app that detects PII entities in user-provided text with a BigBird token-classification model."""

import streamlit as st
from transformers import pipeline, BigBirdTokenizerFast, AutoModelForTokenClassification


def pii_app():
    st.title('PII Data Detection')
    text_input = st.text_area('Enter a paragraph below to get a list of PII in your text.')

    # Load the tokenizer and the fine-tuned PII token-classification model.
    tokenizer = BigBirdTokenizerFast.from_pretrained("google/bigbird-roberta-base", block_size=2)
    model = AutoModelForTokenClassification.from_pretrained("vedantM/BigBird-PII")
    big_bird_classifier = pipeline(
        task="token-classification",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="average",
    )

    # Run the classifier only once the user has entered some text.
    if text_input:
        output = big_bird_classifier(text_input)

        st.header('List of Entities:')
        for entity in output:
            st.write(f"Entity: {entity['word']}, Type: {entity['entity_group']}")

        highlighted_text = highlight_pii(text_input, output)
        st.header('PII Detected Output:')
        st.markdown(highlighted_text, unsafe_allow_html=True)


def highlight_pii(text, entities):
    """Wrap each detected entity span in <mark> tags so it renders highlighted."""
    highlighted_text = text
    offset = 0
    for entity in entities:
        start_idx = entity["start"] + offset
        end_idx = entity["end"] + offset
        highlighted_text = (
            highlighted_text[:start_idx]
            + f'<mark>{highlighted_text[start_idx:end_idx]}</mark>'
            + highlighted_text[end_idx:]
        )
        offset += len('<mark></mark>')  # Adjust offset for the inserted HTML tags
    return highlighted_text


if __name__ == "__main__":
    pii_app()
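
# Usage note: launch the app with the Streamlit CLI rather than plain Python
# (the filename below is an assumption; substitute whatever this file is saved as):
#   streamlit run app.py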