Seanyoon committed on
Commit
78a2900
1 Parent(s): f17c932

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -0
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import transformers
2
+ import pandas as pd
3
+ import streamlit as st
4
+ from preprocess import preprocess_data
5
+
6
def anonymize_text(text):
    """Return the top-5 masked-LM predictions for the first mask token in *text*.

    Encodes *text* with the ``distilbert-base-uncased`` tokenizer, runs the
    matching masked-language model, and decodes the five highest-scoring
    vocabulary entries at the first position that holds the tokenizer's mask
    token.

    Args:
        text: Input string; predictions are produced only if it contains the
            tokenizer's mask token.

    Returns:
        A list of five decoded token strings, highest score first, or an
        empty list when *text* contains no mask token.
    """
    # Imported locally: the file's import block does not bring in torch, so
    # the torch.where / torch.topk calls below would raise NameError.
    import torch

    model_name = "distilbert-base-uncased"
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

    input_ids = tokenizer.encode(text, return_tensors="pt")
    # Column indices (sequence positions) where the mask token occurs.
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
    if mask_token_index.numel() == 0:
        # No mask token: nothing to predict, and indexing row 0 of the
        # top-k result below would raise IndexError.
        return []

    # Load the model only once we know there is something to predict.
    model = transformers.AutoModelForMaskedLM.from_pretrained(model_name)

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        token_logits = model(input_ids)[0]
    mask_token_logits = token_logits[0, mask_token_index, :]

    # Row 0 corresponds to the first mask position found above.
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
    return [tokenizer.decode([token]) for token in top_5_tokens]
25
+
26
def run_app():
    """Render the Streamlit page: CSV upload, preprocessing, column picker."""
    st.title("Text Anonymization App")

    # Upload step.
    st.subheader("Upload your data")
    uploaded = st.file_uploader("Upload CSV", type=["csv"])

    # Nothing uploaded yet: only the title and the uploader are rendered.
    if uploaded is None:
        return

    # Load the CSV and pass it through the project's preprocessing step.
    preprocessed_data = preprocess_data(pd.read_csv(uploaded))

    # One checkbox per column; ticked columns are collected in column order.
    st.subheader("Select columns to anonymize")
    selected_columns = [
        col for col in preprocessed_data.columns if st.checkbox(col)
    ]

    # NOTE(review): selected_columns is not consumed by anything visible
    # here; the original ends on an empty comment at this point.